Pileus Git - ~andy/linux/blob - mm/memcontrol.c

   1 /* memcontrol.c - Memory Controller
   2  *
   3  * Copyright IBM Corporation, 2007
   4  * Author Balbir Singh <balbir@linux.vnet.ibm.com>
   5  *
   6  * Copyright 2007 OpenVZ SWsoft Inc
   7  * Author: Pavel Emelianov <xemul@openvz.org>
   8  *
   9  * Memory thresholds
  10  * Copyright (C) 2009 Nokia Corporation
  11  * Author: Kirill A. Shutemov
  12  *
  13  * Kernel Memory Controller
  14  * Copyright (C) 2012 Parallels Inc. and Google Inc.
  15  * Authors: Glauber Costa and Suleiman Souhlal
  16  *
  17  * This program is free software; you can redistribute it and/or modify
  18  * it under the terms of the GNU General Public License as published by
  19  * the Free Software Foundation; either version 2 of the License, or
  20  * (at your option) any later version.
  21  *
  22  * This program is distributed in the hope that it will be useful,
  23  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  24  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  25  * GNU General Public License for more details.
  26  */
  27
  28 #include <linux/res_counter.h>
  29 #include <linux/memcontrol.h>
  30 #include <linux/cgroup.h>
  31 #include <linux/mm.h>
  32 #include <linux/hugetlb.h>
  33 #include <linux/pagemap.h>
  34 #include <linux/smp.h>
  35 #include <linux/page-flags.h>
  36 #include <linux/backing-dev.h>
  37 #include <linux/bit_spinlock.h>
  38 #include <linux/rcupdate.h>
  39 #include <linux/limits.h>
  40 #include <linux/export.h>
  41 #include <linux/mutex.h>
  42 #include <linux/slab.h>
  43 #include <linux/swap.h>
  44 #include <linux/swapops.h>
  45 #include <linux/spinlock.h>
  46 #include <linux/eventfd.h>
  47 #include <linux/sort.h>
  48 #include <linux/fs.h>
  49 #include <linux/seq_file.h>
  50 #include <linux/vmalloc.h>
  51 #include <linux/vmpressure.h>
  52 #include <linux/mm_inline.h>
  53 #include <linux/page_cgroup.h>
  54 #include <linux/cpu.h>
  55 #include <linux/oom.h>
  56 #include "internal.h"
  57 #include <net/sock.h>
  58 #include <net/ip.h>
  59 #include <net/tcp_memcontrol.h>
  60
  61 #include <asm/uaccess.h>
  62
  63 #include <trace/events/vmscan.h>
  64
/* The memory controller's cgroup subsystem descriptor; exported to modules. */
struct cgroup_subsys mem_cgroup_subsys __read_mostly;
EXPORT_SYMBOL(mem_cgroup_subsys);

/*
 * NOTE(review): presumably the number of reclaim attempts before giving up;
 * not used in the visible part of the file - confirm against its users.
 */
#define MEM_CGROUP_RECLAIM_RETRIES	5
/* The root memory cgroup; memcg_to_vmpressure() falls back to it for NULL. */
static struct mem_cgroup *root_mem_cgroup __read_mostly;
  70
#ifdef CONFIG_MEMCG_SWAP
/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
int do_swap_account __read_mostly;

/* Remembers the boot option; the default follows CONFIG_MEMCG_SWAP_ENABLED. */
#ifdef CONFIG_MEMCG_SWAP_ENABLED
static int really_do_swap_account __initdata = 1;
#else
static int really_do_swap_account __initdata = 0;
#endif

#else
/* Without CONFIG_MEMCG_SWAP, do_swap_account is the compile-time constant 0. */
#define do_swap_account		0
#endif
  85
  86
/*
 * Human-readable names of the per-memcg statistics.
 * NOTE(review): presumably indexed by enum mem_cgroup_stat_index (declared
 * outside this file), so the order must match that enum - confirm.
 */
static const char * const mem_cgroup_stat_names[] = {
	"cache",
	"rss",
	"rss_huge",
	"mapped_file",
	"writeback",
	"swap",
};
  95
/* Indices into the events[] array of struct mem_cgroup_stat_cpu. */
enum mem_cgroup_events_index {
	MEM_CGROUP_EVENTS_PGPGIN,	/* # of pages paged in */
	MEM_CGROUP_EVENTS_PGPGOUT,	/* # of pages paged out */
	MEM_CGROUP_EVENTS_PGFAULT,	/* # of page-faults */
	MEM_CGROUP_EVENTS_PGMAJFAULT,	/* # of major page-faults */
	MEM_CGROUP_EVENTS_NSTATS,	/* number of event counters; keep last */
};
 103
/*
 * Names of the event counters, listed in the same order as
 * enum mem_cgroup_events_index.
 */
static const char * const mem_cgroup_events_names[] = {
	"pgpgin",
	"pgpgout",
	"pgfault",
	"pgmajfault",
};
 110
/*
 * Names of the LRU lists.
 * NOTE(review): presumably in enum lru_list order (declared elsewhere) -
 * confirm before reordering.
 */
static const char * const mem_cgroup_lru_names[] = {
	"inactive_anon",
	"active_anon",
	"inactive_file",
	"active_file",
	"unevictable",
};
 118
/*
 * The per-memcg event counter is incremented at every pagein/pageout. With
 * THP, it is incremented by the number of pages. This counter is used to
 * trigger some periodic events. This is straightforward and better
 * than using jiffies etc. to handle periodic memcg events.
 */
enum mem_cgroup_events_target {
	MEM_CGROUP_TARGET_THRESH,
	MEM_CGROUP_TARGET_SOFTLIMIT,
	MEM_CGROUP_TARGET_NUMAINFO,
	MEM_CGROUP_NTARGETS,
};
/*
 * Number of page events between two firings of the respective target;
 * see mem_cgroup_event_ratelimit().
 */
#define THRESHOLDS_EVENTS_TARGET 128
#define SOFTLIMIT_EVENTS_TARGET 1024
#define NUMAINFO_EVENTS_TARGET	1024
 134
/* Per-cpu statistics and event bookkeeping of one memcg. */
struct mem_cgroup_stat_cpu {
	/* statistics, indexed by enum mem_cgroup_stat_index */
	long count[MEM_CGROUP_STAT_NSTATS];
	/* event counters, indexed by enum mem_cgroup_events_index */
	unsigned long events[MEM_CGROUP_EVENTS_NSTATS];
	/* running count of page events, advanced on every charge/uncharge */
	unsigned long nr_page_events;
	/* nr_page_events value at which each events target fires next */
	unsigned long targets[MEM_CGROUP_NTARGETS];
};
 141
/* Iteration state of a hierarchy walk; one per reclaim priority and zone. */
struct mem_cgroup_reclaim_iter {
	/*
	 * Last scanned hierarchy member. Valid only if last_dead_count
	 * matches memcg->dead_count of the hierarchy root group.
	 */
	struct mem_cgroup *last_visited;
	unsigned long last_dead_count;

	/* scan generation, increased every round-trip */
	unsigned int generation;
};
 153
/*
 * Per-zone information in the memory controller.
 */
struct mem_cgroup_per_zone {
	struct lruvec		lruvec;
	/* size of each LRU list, indexed by enum lru_list */
	unsigned long		lru_size[NR_LRU_LISTS];

	/* one hierarchy iterator per reclaim priority (0..DEF_PRIORITY) */
	struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];

	struct mem_cgroup	*memcg;		/* Back pointer, we cannot */
						/* use container_of	   */
};
 166
/* Per-node information: one mem_cgroup_per_zone for each possible zone. */
struct mem_cgroup_per_node {
	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
};
 170
/* One registered usage threshold and the eventfd associated with it. */
struct mem_cgroup_threshold {
	struct eventfd_ctx *eventfd;
	u64 threshold;
};
 175
 176 /* For threshold */
 177 struct mem_cgroup_threshold_ary {
 178         /* An array index points to threshold just below or equal to usage. */
 179         int current_threshold;
 180         /* Size of entries[] */
 181         unsigned int size;
 182         /* Array of thresholds */
 183         struct mem_cgroup_threshold entries[0];
 184 };
 185
/* The pair of threshold arrays kept for one counter. */
struct mem_cgroup_thresholds {
	/* Primary thresholds array */
	struct mem_cgroup_threshold_ary *primary;
	/*
	 * Spare threshold array.
	 * This is needed to make mem_cgroup_unregister_event() "never fail".
	 * It must be able to store at least primary->size - 1 entries.
	 */
	struct mem_cgroup_threshold_ary *spare;
};
 196
/* For OOM: list node carrying one eventfd (see mem_cgroup::oom_notify). */
struct mem_cgroup_eventfd_list {
	struct list_head list;
	struct eventfd_ctx *eventfd;
};
 202
/*
 * Forward declarations of helpers that are used before their definitions
 * (mem_cgroup_threshold() is called from memcg_check_events()).
 */
static void mem_cgroup_threshold(struct mem_cgroup *memcg);
static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
 205
 206 /*
 207  * The memory controller data structure. The memory controller controls both
 208  * page cache and RSS per cgroup. We would eventually like to provide
 209  * statistics based on the statistics developed by Rik Van Riel for clock-pro,
 210  * to help the administrator determine what knobs to tune.
 211  *
 212  * TODO: Add a water mark for the memory controller. Reclaim will begin when
 213  * we hit the water mark. May be even add a low water mark, such that
 214  * no reclaim occurs from a cgroup at it's low water mark, this is
 215  * a feature that will be implemented much later in the future.
 216  */
 217 struct mem_cgroup {
 218         struct cgroup_subsys_state css;
 219         /*
 220          * the counter to account for memory usage
 221          */
 222         struct res_counter res;
 223
 224         /* vmpressure notifications */
 225         struct vmpressure vmpressure;
 226
 227         /*
 228          * the counter to account for mem+swap usage.
 229          */
 230         struct res_counter memsw;
 231
 232         /*
 233          * the counter to account for kernel memory usage.
 234          */
 235         struct res_counter kmem;
 236         /*
 237          * Should the accounting and control be hierarchical, per subtree?
 238          */
 239         bool use_hierarchy;
 240         unsigned long kmem_account_flags; /* See KMEM_ACCOUNTED_*, below */
 241
 242         bool            oom_lock;
 243         atomic_t        under_oom;
 244         atomic_t        oom_wakeups;
 245
 246         int     swappiness;
 247         /* OOM-Killer disable */
 248         int             oom_kill_disable;
 249
 250         /* set when res.limit == memsw.limit */
 251         bool            memsw_is_minimum;
 252
 253         /* protect arrays of thresholds */
 254         struct mutex thresholds_lock;
 255
 256         /* thresholds for memory usage. RCU-protected */
 257         struct mem_cgroup_thresholds thresholds;
 258
 259         /* thresholds for mem+swap usage. RCU-protected */
 260         struct mem_cgroup_thresholds memsw_thresholds;
 261
 262         /* For oom notifier event fd */
 263         struct list_head oom_notify;
 264
 265         /*
 266          * Should we move charges of a task when a task is moved into this
 267          * mem_cgroup ? And what type of charges should we move ?
 268          */
 269         unsigned long move_charge_at_immigrate;
 270         /*
 271          * set > 0 if pages under this cgroup are moving to other cgroup.
 272          */
 273         atomic_t        moving_account;
 274         /* taken only while moving_account > 0 */
 275         spinlock_t      move_lock;
 276         /*
 277          * percpu counter.
 278          */
 279         struct mem_cgroup_stat_cpu __percpu *stat;
 280         /*
 281          * used when a cpu is offlined or other synchronizations
 282          * See mem_cgroup_read_stat().
 283          */
 284         struct mem_cgroup_stat_cpu nocpu_base;
 285         spinlock_t pcp_counter_lock;
 286
 287         atomic_t        dead_count;
 288 #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
 289         struct tcp_memcontrol tcp_mem;
 290 #endif
 291 #if defined(CONFIG_MEMCG_KMEM)
 292         /* analogous to slab_common's slab_caches list. per-memcg */
 293         struct list_head memcg_slab_caches;
 294         /* Not a spinlock, we can take a lot of time walking the list */
 295         struct mutex slab_caches_mutex;
 296         /* Index in the kmem_cache->memcg_params->memcg_caches array */
 297         int kmemcg_id;
 298 #endif
 299
 300         int last_scanned_node;
 301 #if MAX_NUMNODES > 1
 302         nodemask_t      scan_nodes;
 303         atomic_t        numainfo_events;
 304         atomic_t        numainfo_updating;
 305 #endif
 306         /*
 307          * Protects soft_contributed transitions.
 308          * See mem_cgroup_update_soft_limit
 309          */
 310         spinlock_t soft_lock;
 311
 312         /*
 313          * If true then this group has increased parents' children_in_excess
 314          * when it got over the soft limit.
 315          * When a group falls bellow the soft limit, parents' children_in_excess
 316          * is decreased and soft_contributed changed to false.
 317          */
 318         bool soft_contributed;
 319
 320         /* Number of children that are in soft limit excess */
 321         atomic_t children_in_excess;
 322
 323         struct mem_cgroup_per_node *nodeinfo[0];
 324         /* WARNING: nodeinfo must be the last member here */
 325 };
 326
 327 static size_t memcg_size(void)
 328 {
 329         return sizeof(struct mem_cgroup) +
 330                 nr_node_ids * sizeof(struct mem_cgroup_per_node);
 331 }
 332
/*
 * Internal-only representation of the status of kmem accounting.
 * The values are bit numbers within mem_cgroup::kmem_account_flags.
 */
enum {
	KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */
	KMEM_ACCOUNTED_ACTIVATED, /* static key enabled. */
	KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */
};

/* We account when limit is on, but only after call sites are patched */
#define KMEM_ACCOUNTED_MASK \
		((1 << KMEM_ACCOUNTED_ACTIVE) | (1 << KMEM_ACCOUNTED_ACTIVATED))
 343
 344 #ifdef CONFIG_MEMCG_KMEM
 345 static inline void memcg_kmem_set_active(struct mem_cgroup *memcg)
 346 {
 347         set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
 348 }
 349
 350 static bool memcg_kmem_is_active(struct mem_cgroup *memcg)
 351 {
 352         return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
 353 }
 354
 355 static void memcg_kmem_set_activated(struct mem_cgroup *memcg)
 356 {
 357         set_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
 358 }
 359
 360 static void memcg_kmem_clear_activated(struct mem_cgroup *memcg)
 361 {
 362         clear_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
 363 }
 364
 365 static void memcg_kmem_mark_dead(struct mem_cgroup *memcg)
 366 {
 367         /*
 368          * Our caller must use css_get() first, because memcg_uncharge_kmem()
 369          * will call css_put() if it sees the memcg is dead.
 370          */
 371         smp_wmb();
 372         if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags))
 373                 set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags);
 374 }
 375
 376 static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg)
 377 {
 378         return test_and_clear_bit(KMEM_ACCOUNTED_DEAD,
 379                                   &memcg->kmem_account_flags);
 380 }
 381 #endif
 382
/* Definitions for moving charges at task migration. */
/*
 * Types of charges to be moved. "move_charge_at_immigrate" and
 * "immigrate_flags" are treated as a left-shifted bitmap of these types.
 */
enum move_type {
	MOVE_CHARGE_TYPE_ANON,	/* private anonymous page and swap of it */
	MOVE_CHARGE_TYPE_FILE,	/* file page(including tmpfs) and swap of it */
	NR_MOVE_TYPE,
};
 393
/*
 * State of an in-progress charge move.
 * "mc" and its members are protected by cgroup_mutex
 */
static struct move_charge_struct {
	spinlock_t	  lock; /* for from, to */
	struct mem_cgroup *from;
	struct mem_cgroup *to;
	/* bitmap of enum move_type bits; read by move_anon()/move_file() */
	unsigned long immigrate_flags;
	unsigned long precharge;
	unsigned long moved_charge;
	unsigned long moved_swap;
	struct task_struct *moving_task;	/* a task moving charges */
	wait_queue_head_t waitq;		/* a waitq for other context */
} mc = {
	.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
	.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
};
 409
 410 static bool move_anon(void)
 411 {
 412         return test_bit(MOVE_CHARGE_TYPE_ANON, &mc.immigrate_flags);
 413 }
 414
 415 static bool move_file(void)
 416 {
 417         return test_bit(MOVE_CHARGE_TYPE_FILE, &mc.immigrate_flags);
 418 }
 419
/*
 * Maximum number of loops in mem_cgroup_hierarchical_reclaim(), used for
 * soft limit reclaim to prevent infinite loops, should they ever occur.
 */
#define MEM_CGROUP_MAX_RECLAIM_LOOPS		100
 425
/* Kinds of charges; NR_CHARGE_TYPE is the number of kinds and stays last. */
enum charge_type {
	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
	MEM_CGROUP_CHARGE_TYPE_ANON,
	MEM_CGROUP_CHARGE_TYPE_SWAPOUT,	/* for accounting swapcache */
	MEM_CGROUP_CHARGE_TYPE_DROP,	/* a page was unused swap cache */
	NR_CHARGE_TYPE,
};
 433
/*
 * Resource type encoded into a control file's cft->private value; it is the
 * "type" half handled by MEMFILE_PRIVATE()/MEMFILE_TYPE().
 */
enum res_type {
	_MEM,
	_MEMSWAP,
	_OOM_TYPE,
	_KMEM,
};
 441
 442 #define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
 443 #define MEMFILE_TYPE(val)       ((val) >> 16 & 0xffff)
 444 #define MEMFILE_ATTR(val)       ((val) & 0xffff)
 445 /* Used for OOM nofiier */
 446 #define OOM_CONTROL             (0)
 447
/*
 * Reclaim flags for mem_cgroup_hierarchical_reclaim.
 * Each flag is defined as a bit number (*_BIT) plus the matching mask.
 */
#define MEM_CGROUP_RECLAIM_NOSWAP_BIT	0x0
#define MEM_CGROUP_RECLAIM_NOSWAP	(1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
#define MEM_CGROUP_RECLAIM_SHRINK_BIT	0x1
#define MEM_CGROUP_RECLAIM_SHRINK	(1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
 455
/*
 * The memcg_create_mutex is held whenever a new cgroup is created.
 * As a consequence, any change that needs protection against new child
 * cgroups appearing has to hold it as well.
 */
static DEFINE_MUTEX(memcg_create_mutex);
 462
 463 struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
 464 {
 465         return s ? container_of(s, struct mem_cgroup, css) : NULL;
 466 }
 467
 468 /* Some nice accessors for the vmpressure. */
 469 struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
 470 {
 471         if (!memcg)
 472                 memcg = root_mem_cgroup;
 473         return &memcg->vmpressure;
 474 }
 475
 476 struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
 477 {
 478         return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
 479 }
 480
 481 struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css)
 482 {
 483         return &mem_cgroup_from_css(css)->vmpressure;
 484 }
 485
 486 static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
 487 {
 488         return (memcg == root_mem_cgroup);
 489 }
 490
/*
 * We restrict the id to the range [1, 65535], so that it fits into
 * an unsigned short.
 */
#define MEM_CGROUP_ID_MAX	USHRT_MAX
 496
 497 static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
 498 {
 499         /*
 500          * The ID of the root cgroup is 0, but memcg treat 0 as an
 501          * invalid ID, so we return (cgroup_id + 1).
 502          */
 503         return memcg->css.cgroup->id + 1;
 504 }
 505
 506 static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
 507 {
 508         struct cgroup_subsys_state *css;
 509
 510         css = css_from_id(id - 1, &mem_cgroup_subsys);
 511         return mem_cgroup_from_css(css);
 512 }
 513
 514 /* Writing them here to avoid exposing memcg's inner layout */
 515 #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
 516
 517 void sock_update_memcg(struct sock *sk)
 518 {
 519         if (mem_cgroup_sockets_enabled) {
 520                 struct mem_cgroup *memcg;
 521                 struct cg_proto *cg_proto;
 522
 523                 BUG_ON(!sk->sk_prot->proto_cgroup);
 524
 525                 /* Socket cloning can throw us here with sk_cgrp already
 526                  * filled. It won't however, necessarily happen from
 527                  * process context. So the test for root memcg given
 528                  * the current task's memcg won't help us in this case.
 529                  *
 530                  * Respecting the original socket's memcg is a better
 531                  * decision in this case.
 532                  */
 533                 if (sk->sk_cgrp) {
 534                         BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg));
 535                         css_get(&sk->sk_cgrp->memcg->css);
 536                         return;
 537                 }
 538
 539                 rcu_read_lock();
 540                 memcg = mem_cgroup_from_task(current);
 541                 cg_proto = sk->sk_prot->proto_cgroup(memcg);
 542                 if (!mem_cgroup_is_root(memcg) &&
 543                     memcg_proto_active(cg_proto) && css_tryget(&memcg->css)) {
 544                         sk->sk_cgrp = cg_proto;
 545                 }
 546                 rcu_read_unlock();
 547         }
 548 }
 549 EXPORT_SYMBOL(sock_update_memcg);
 550
 551 void sock_release_memcg(struct sock *sk)
 552 {
 553         if (mem_cgroup_sockets_enabled && sk->sk_cgrp) {
 554                 struct mem_cgroup *memcg;
 555                 WARN_ON(!sk->sk_cgrp->memcg);
 556                 memcg = sk->sk_cgrp->memcg;
 557                 css_put(&sk->sk_cgrp->memcg->css);
 558         }
 559 }
 560
 561 struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
 562 {
 563         if (!memcg || mem_cgroup_is_root(memcg))
 564                 return NULL;
 565
 566         return &memcg->tcp_mem.cg_proto;
 567 }
 568 EXPORT_SYMBOL(tcp_proto_cgroup);
 569
 570 static void disarm_sock_keys(struct mem_cgroup *memcg)
 571 {
 572         if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto))
 573                 return;
 574         static_key_slow_dec(&memcg_socket_limit_enabled);
 575 }
 576 #else
/* No-op unless both CONFIG_INET and CONFIG_MEMCG_KMEM are enabled. */
static void disarm_sock_keys(struct mem_cgroup *memcg)
{
}
 580 #endif
 581
 582 #ifdef CONFIG_MEMCG_KMEM
/*
 * This will be the memcg's index in each cache's ->memcg_params->memcg_caches.
 * There are two main reasons for not using the css_id for this:
 *  1) this works better in sparse environments, where we have a lot of memcgs,
 *     but only a few kmem-limited. For instance, if we have 200 memcgs and
 *     none but the 200th is kmem-limited, we would need a 200 entry array
 *     for that.
 *
 *  2) In order not to violate the cgroup API, we would like to do all memory
 *     allocation in ->create(). At that point, we haven't yet allocated the
 *     css_id. Having a separate index prevents us from messing with the cgroup
 *     core for this.
 *
 * The current size of the caches array is stored in
 * memcg_limited_groups_array_size.  It will double each time we have to
 * increase it.
 */
static DEFINE_IDA(kmem_limited_groups);
int memcg_limited_groups_array_size;
 602
/*
 * MIN_SIZE is different from 1, because we would like to avoid going through
 * the alloc/free process all the time. On a small machine, 4 kmem-limited
 * cgroups is a reasonable guess. In the future, it could be a parameter or
 * tunable, but that is not strictly necessary.
 *
 * MAX_SIZE should be as large as the number of css_ids. Ideally, we could get
 * this constant directly from cgroup, but it is understandable that this is
 * better kept as an internal representation in cgroup.c. In any case, the
 * css_id space is not getting any smaller, and we don't necessarily have to
 * increase ours as well if it increases.
 */
#define MEMCG_CACHES_MIN_SIZE 4
#define MEMCG_CACHES_MAX_SIZE 65535
 617
/*
 * A lot of the calls to the cache allocation functions are expected to be
 * inlined by the compiler. Since the calls to memcg_kmem_get_cache are
 * conditional on this static branch, we have to allow modules that call
 * kmem_cache_alloc and the like to see this symbol as well.
 */
struct static_key memcg_kmem_enabled_key;
EXPORT_SYMBOL(memcg_kmem_enabled_key);
 626
 627 static void disarm_kmem_keys(struct mem_cgroup *memcg)
 628 {
 629         if (memcg_kmem_is_active(memcg)) {
 630                 static_key_slow_dec(&memcg_kmem_enabled_key);
 631                 ida_simple_remove(&kmem_limited_groups, memcg->kmemcg_id);
 632         }
 633         /*
 634          * This check can't live in kmem destruction function,
 635          * since the charges will outlive the cgroup
 636          */
 637         WARN_ON(res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0);
 638 }
 639 #else
/* No-op when CONFIG_MEMCG_KMEM is not enabled. */
static void disarm_kmem_keys(struct mem_cgroup *memcg)
{
}
 643 #endif /* CONFIG_MEMCG_KMEM */
 644
/*
 * Run both static-key disarm helpers for @memcg: the socket keys first,
 * then the kmem keys.
 */
static void disarm_static_keys(struct mem_cgroup *memcg)
{
	disarm_sock_keys(memcg);
	disarm_kmem_keys(memcg);
}
 650
/* Forward declaration; the definition is not in this part of the file. */
static void drain_all_stock_async(struct mem_cgroup *memcg);
 652
 653 static struct mem_cgroup_per_zone *
 654 mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid)
 655 {
 656         VM_BUG_ON((unsigned)nid >= nr_node_ids);
 657         return &memcg->nodeinfo[nid]->zoneinfo[zid];
 658 }
 659
 660 struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
 661 {
 662         return &memcg->css;
 663 }
 664
 665 static struct mem_cgroup_per_zone *
 666 page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page)
 667 {
 668         int nid = page_to_nid(page);
 669         int zid = page_zonenum(page);
 670
 671         return mem_cgroup_zoneinfo(memcg, nid, zid);
 672 }
 673
 674 /*
 675  * Implementation Note: reading percpu statistics for memcg.
 676  *
 677  * Both of vmstat[] and percpu_counter has threshold and do periodic
 678  * synchronization to implement "quick" read. There are trade-off between
 679  * reading cost and precision of value. Then, we may have a chance to implement
 680  * a periodic synchronizion of counter in memcg's counter.
 681  *
 682  * But this _read() function is used for user interface now. The user accounts
 683  * memory usage by memory cgroup and he _always_ requires exact value because
 684  * he accounts memory. Even if we provide quick-and-fuzzy read, we always
 685  * have to visit all online cpus and make sum. So, for now, unnecessary
 686  * synchronization is not implemented. (just implemented for cpu hotplug)
 687  *
 688  * If there are kernel internal actions which can make use of some not-exact
 689  * value, and reading all cpu value can be performance bottleneck in some
 690  * common workload, threashold and synchonization as vmstat[] should be
 691  * implemented.
 692  */
 693 static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
 694                                  enum mem_cgroup_stat_index idx)
 695 {
 696         long val = 0;
 697         int cpu;
 698
 699         get_online_cpus();
 700         for_each_online_cpu(cpu)
 701                 val += per_cpu(memcg->stat->count[idx], cpu);
 702 #ifdef CONFIG_HOTPLUG_CPU
 703         spin_lock(&memcg->pcp_counter_lock);
 704         val += memcg->nocpu_base.count[idx];
 705         spin_unlock(&memcg->pcp_counter_lock);
 706 #endif
 707         put_online_cpus();
 708         return val;
 709 }
 710
 711 static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
 712                                          bool charge)
 713 {
 714         int val = (charge) ? 1 : -1;
 715         this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val);
 716 }
 717
 718 static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
 719                                             enum mem_cgroup_events_index idx)
 720 {
 721         unsigned long val = 0;
 722         int cpu;
 723
 724         for_each_online_cpu(cpu)
 725                 val += per_cpu(memcg->stat->events[idx], cpu);
 726 #ifdef CONFIG_HOTPLUG_CPU
 727         spin_lock(&memcg->pcp_counter_lock);
 728         val += memcg->nocpu_base.events[idx];
 729         spin_unlock(&memcg->pcp_counter_lock);
 730 #endif
 731         return val;
 732 }
 733
 734 static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
 735                                          struct page *page,
 736                                          bool anon, int nr_pages)
 737 {
 738         preempt_disable();
 739
 740         /*
 741          * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
 742          * counted as CACHE even if it's on ANON LRU.
 743          */
 744         if (anon)
 745                 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
 746                                 nr_pages);
 747         else
 748                 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
 749                                 nr_pages);
 750
 751         if (PageTransHuge(page))
 752                 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
 753                                 nr_pages);
 754
 755         /* pagein of a big page is an event. So, ignore page size */
 756         if (nr_pages > 0)
 757                 __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
 758         else {
 759                 __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
 760                 nr_pages = -nr_pages; /* for event */
 761         }
 762
 763         __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
 764
 765         preempt_enable();
 766 }
 767
 768 unsigned long
 769 mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
 770 {
 771         struct mem_cgroup_per_zone *mz;
 772
 773         mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
 774         return mz->lru_size[lru];
 775 }
 776
 777 static unsigned long
 778 mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid,
 779                         unsigned int lru_mask)
 780 {
 781         struct mem_cgroup_per_zone *mz;
 782         enum lru_list lru;
 783         unsigned long ret = 0;
 784
 785         mz = mem_cgroup_zoneinfo(memcg, nid, zid);
 786
 787         for_each_lru(lru) {
 788                 if (BIT(lru) & lru_mask)
 789                         ret += mz->lru_size[lru];
 790         }
 791         return ret;
 792 }
 793
 794 static unsigned long
 795 mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
 796                         int nid, unsigned int lru_mask)
 797 {
 798         u64 total = 0;
 799         int zid;
 800
 801         for (zid = 0; zid < MAX_NR_ZONES; zid++)
 802                 total += mem_cgroup_zone_nr_lru_pages(memcg,
 803                                                 nid, zid, lru_mask);
 804
 805         return total;
 806 }
 807
 808 static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
 809                         unsigned int lru_mask)
 810 {
 811         int nid;
 812         u64 total = 0;
 813
 814         for_each_node_state(nid, N_MEMORY)
 815                 total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
 816         return total;
 817 }
 818
 819 static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
 820                                        enum mem_cgroup_events_target target)
 821 {
 822         unsigned long val, next;
 823
 824         val = __this_cpu_read(memcg->stat->nr_page_events);
 825         next = __this_cpu_read(memcg->stat->targets[target]);
 826         /* from time_after() in jiffies.h */
 827         if ((long)next - (long)val < 0) {
 828                 switch (target) {
 829                 case MEM_CGROUP_TARGET_THRESH:
 830                         next = val + THRESHOLDS_EVENTS_TARGET;
 831                         break;
 832                 case MEM_CGROUP_TARGET_SOFTLIMIT:
 833                         next = val + SOFTLIMIT_EVENTS_TARGET;
 834                         break;
 835                 case MEM_CGROUP_TARGET_NUMAINFO:
 836                         next = val + NUMAINFO_EVENTS_TARGET;
 837                         break;
 838                 default:
 839                         break;
 840                 }
 841                 __this_cpu_write(memcg->stat->targets[target], next);
 842                 return true;
 843         }
 844         return false;
 845 }
 846
 847 /*
 848  * Called from rate-limited memcg_check_events when enough
 849  * MEM_CGROUP_TARGET_SOFTLIMIT events are accumulated and it makes sure
 850  * that all the parents up the hierarchy will be notified that this group
 851  * is in excess or that it is not in excess anymore. mmecg->soft_contributed
 852  * makes the transition a single action whenever the state flips from one to
 853  * the other.
 854  */
 855 static void mem_cgroup_update_soft_limit(struct mem_cgroup *memcg)
 856 {
 857         unsigned long long excess = res_counter_soft_limit_excess(&memcg->res);
 858         struct mem_cgroup *parent = memcg;
 859         int delta = 0;
 860
 861         spin_lock(&memcg->soft_lock);
 862         if (excess) {
 863                 if (!memcg->soft_contributed) {
 864                         delta = 1;
 865                         memcg->soft_contributed = true;
 866                 }
 867         } else {
 868                 if (memcg->soft_contributed) {
 869                         delta = -1;
 870                         memcg->soft_contributed = false;
 871                 }
 872         }
 873
 874         /*
 875          * Necessary to update all ancestors when hierarchy is used
 876          * because their event counter is not touched.
 877          * We track children even outside the hierarchy for the root
 878          * cgroup because tree walk starting at root should visit
 879          * all cgroups and we want to prevent from pointless tree
 880          * walk if no children is below the limit.
 881          */
 882         while (delta && (parent = parent_mem_cgroup(parent)))
 883                 atomic_add(delta, &parent->children_in_excess);
 884         if (memcg != root_mem_cgroup && !root_mem_cgroup->use_hierarchy)
 885                 atomic_add(delta, &root_mem_cgroup->children_in_excess);
 886         spin_unlock(&memcg->soft_lock);
 887 }
 888
 889 /*
 890  * Check events in order.
 891  *
 892  */
 893 static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
 894 {
 895         preempt_disable();
 896         /* threshold event is triggered in finer grain than soft limit */
 897         if (unlikely(mem_cgroup_event_ratelimit(memcg,
 898                                                 MEM_CGROUP_TARGET_THRESH))) {
 899                 bool do_softlimit;
 900                 bool do_numainfo __maybe_unused;
 901
 902                 do_softlimit = mem_cgroup_event_ratelimit(memcg,
 903                                                 MEM_CGROUP_TARGET_SOFTLIMIT);
 904 #if MAX_NUMNODES > 1
 905                 do_numainfo = mem_cgroup_event_ratelimit(memcg,
 906                                                 MEM_CGROUP_TARGET_NUMAINFO);
 907 #endif
 908                 preempt_enable();
 909
 910                 mem_cgroup_threshold(memcg);
 911                 if (unlikely(do_softlimit))
 912                         mem_cgroup_update_soft_limit(memcg);
 913 #if MAX_NUMNODES > 1
 914                 if (unlikely(do_numainfo))
 915                         atomic_inc(&memcg->numainfo_events);
 916 #endif
 917         } else
 918                 preempt_enable();
 919 }
 920
 921 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
 922 {
 923         /*
 924          * mm_update_next_owner() may clear mm->owner to NULL
 925          * if it races with swapoff, page migration, etc.
 926          * So this can be called with p == NULL.
 927          */
 928         if (unlikely(!p))
 929                 return NULL;
 930
 931         return mem_cgroup_from_css(task_css(p, mem_cgroup_subsys_id));
 932 }
 933
 934 struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
 935 {
 936         struct mem_cgroup *memcg = NULL;
 937
 938         if (!mm)
 939                 return NULL;
 940         /*
 941          * Because we have no locks, mm->owner's may be being moved to other
 942          * cgroup. We use css_tryget() here even if this looks
 943          * pessimistic (rather than adding locks here).
 944          */
 945         rcu_read_lock();
 946         do {
 947                 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
 948                 if (unlikely(!memcg))
 949                         break;
 950         } while (!css_tryget(&memcg->css));
 951         rcu_read_unlock();
 952         return memcg;
 953 }
 954
 955 static enum mem_cgroup_filter_t
 956 mem_cgroup_filter(struct mem_cgroup *memcg, struct mem_cgroup *root,
 957                 mem_cgroup_iter_filter cond)
 958 {
 959         if (!cond)
 960                 return VISIT;
 961         return cond(memcg, root);
 962 }
 963
 964 /*
 965  * Returns a next (in a pre-order walk) alive memcg (with elevated css
 966  * ref. count) or NULL if the whole root's subtree has been visited.
 967  *
 968  * helper function to be used by mem_cgroup_iter
 969  */
 970 static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root,
 971                 struct mem_cgroup *last_visited, mem_cgroup_iter_filter cond)
 972 {
 973         struct cgroup_subsys_state *prev_css, *next_css;
 974
 975         prev_css = last_visited ? &last_visited->css : NULL;
 976 skip_node:
 977         next_css = css_next_descendant_pre(prev_css, &root->css);
 978
 979         /*
 980          * Even if we found a group we have to make sure it is
 981          * alive. css && !memcg means that the groups should be
 982          * skipped and we should continue the tree walk.
 983          * last_visited css is safe to use because it is
 984          * protected by css_get and the tree walk is rcu safe.
 985          */
 986         if (next_css) {
 987                 struct mem_cgroup *mem = mem_cgroup_from_css(next_css);
 988
 989                 switch (mem_cgroup_filter(mem, root, cond)) {
 990                 case SKIP:
 991                         prev_css = next_css;
 992                         goto skip_node;
 993                 case SKIP_TREE:
 994                         if (mem == root)
 995                                 return NULL;
 996                         /*
 997                          * css_rightmost_descendant is not an optimal way to
 998                          * skip through a subtree (especially for imbalanced
 999                          * trees leaning to right) but that's what we have right
1000                          * now. More effective solution would be traversing
1001                          * right-up for first non-NULL without calling
1002                          * css_next_descendant_pre afterwards.
1003                          */
1004                         prev_css = css_rightmost_descendant(next_css);
1005                         goto skip_node;
1006                 case VISIT:
1007                         if (css_tryget(&mem->css))
1008                                 return mem;
1009                         else {
1010                                 prev_css = next_css;
1011                                 goto skip_node;
1012                         }
1013                         break;
1014                 }
1015         }
1016
1017         return NULL;
1018 }
1019
1020 static void mem_cgroup_iter_invalidate(struct mem_cgroup *root)
1021 {
1022         /*
1023          * When a group in the hierarchy below root is destroyed, the
1024          * hierarchy iterator can no longer be trusted since it might
1025          * have pointed to the destroyed group.  Invalidate it.
1026          */
1027         atomic_inc(&root->dead_count);
1028 }
1029
1030 static struct mem_cgroup *
1031 mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
1032                      struct mem_cgroup *root,
1033                      int *sequence)
1034 {
1035         struct mem_cgroup *position = NULL;
1036         /*
1037          * A cgroup destruction happens in two stages: offlining and
1038          * release.  They are separated by a RCU grace period.
1039          *
1040          * If the iterator is valid, we may still race with an
1041          * offlining.  The RCU lock ensures the object won't be
1042          * released, tryget will fail if we lost the race.
1043          */
1044         *sequence = atomic_read(&root->dead_count);
1045         if (iter->last_dead_count == *sequence) {
1046                 smp_rmb();
1047                 position = iter->last_visited;
1048                 if (position && !css_tryget(&position->css))
1049                         position = NULL;
1050         }
1051         return position;
1052 }
1053
1054 static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
1055                                    struct mem_cgroup *last_visited,
1056                                    struct mem_cgroup *new_position,
1057                                    int sequence)
1058 {
1059         if (last_visited)
1060                 css_put(&last_visited->css);
1061         /*
1062          * We store the sequence count from the time @last_visited was
1063          * loaded successfully instead of rereading it here so that we
1064          * don't lose destruction events in between.  We could have
1065          * raced with the destruction of @new_position after all.
1066          */
1067         iter->last_visited = new_position;
1068         smp_wmb();
1069         iter->last_dead_count = sequence;
1070 }
1071
1072 /**
1073  * mem_cgroup_iter - iterate over memory cgroup hierarchy
1074  * @root: hierarchy root
1075  * @prev: previously returned memcg, NULL on first invocation
1076  * @reclaim: cookie for shared reclaim walks, NULL for full walks
1077  * @cond: filter for visited nodes, NULL for no filter
1078  *
1079  * Returns references to children of the hierarchy below @root, or
1080  * @root itself, or %NULL after a full round-trip.
1081  *
1082  * Caller must pass the return value in @prev on subsequent
1083  * invocations for reference counting, or use mem_cgroup_iter_break()
1084  * to cancel a hierarchy walk before the round-trip is complete.
1085  *
1086  * Reclaimers can specify a zone and a priority level in @reclaim to
1087  * divide up the memcgs in the hierarchy among all concurrent
1088  * reclaimers operating on the same zone and priority.
1089  */
1090 struct mem_cgroup *mem_cgroup_iter_cond(struct mem_cgroup *root,
1091                                    struct mem_cgroup *prev,
1092                                    struct mem_cgroup_reclaim_cookie *reclaim,
1093                                    mem_cgroup_iter_filter cond)
1094 {
1095         struct mem_cgroup *memcg = NULL;
1096         struct mem_cgroup *last_visited = NULL;
1097
1098         if (mem_cgroup_disabled()) {
1099                 /* first call must return non-NULL, second return NULL */
1100                 return (struct mem_cgroup *)(unsigned long)!prev;
1101         }
1102
1103         if (!root)
1104                 root = root_mem_cgroup;
1105
1106         if (prev && !reclaim)
1107                 last_visited = prev;
1108
1109         if (!root->use_hierarchy && root != root_mem_cgroup) {
1110                 if (prev)
1111                         goto out_css_put;
1112                 if (mem_cgroup_filter(root, root, cond) == VISIT)
1113                         return root;
1114                 return NULL;
1115         }
1116
1117         rcu_read_lock();
1118         while (!memcg) {
1119                 struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
1120                 int uninitialized_var(seq);
1121
1122                 if (reclaim) {
1123                         int nid = zone_to_nid(reclaim->zone);
1124                         int zid = zone_idx(reclaim->zone);
1125                         struct mem_cgroup_per_zone *mz;
1126
1127                         mz = mem_cgroup_zoneinfo(root, nid, zid);
1128                         iter = &mz->reclaim_iter[reclaim->priority];
1129                         if (prev && reclaim->generation != iter->generation) {
1130                                 iter->last_visited = NULL;
1131                                 goto out_unlock;
1132                         }
1133
1134                         last_visited = mem_cgroup_iter_load(iter, root, &seq);
1135                 }
1136
1137                 memcg = __mem_cgroup_iter_next(root, last_visited, cond);
1138
1139                 if (reclaim) {
1140                         mem_cgroup_iter_update(iter, last_visited, memcg, seq);
1141
1142                         if (!memcg)
1143                                 iter->generation++;
1144                         else if (!prev && memcg)
1145                                 reclaim->generation = iter->generation;
1146                 }
1147
1148                 /*
1149                  * We have finished the whole tree walk or no group has been
1150                  * visited because filter told us to skip the root node.
1151                  */
1152                 if (!memcg && (prev || (cond && !last_visited)))
1153                         goto out_unlock;
1154         }
1155 out_unlock:
1156         rcu_read_unlock();
1157 out_css_put:
1158         if (prev && prev != root)
1159                 css_put(&prev->css);
1160
1161         return memcg;
1162 }
1163
1164 /**
1165  * mem_cgroup_iter_break - abort a hierarchy walk prematurely
1166  * @root: hierarchy root
1167  * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
1168  */
1169 void mem_cgroup_iter_break(struct mem_cgroup *root,
1170                            struct mem_cgroup *prev)
1171 {
1172         if (!root)
1173                 root = root_mem_cgroup;
1174         if (prev && prev != root)
1175                 css_put(&prev->css);
1176 }
1177
/*
 * Iteration constructs for visiting all cgroups (under a tree).  If
 * loops are exited prematurely (break), mem_cgroup_iter_break() must
 * be used for reference counting.
 *
 * for_each_mem_cgroup_tree() walks the tree below @root,
 * for_each_mem_cgroup() passes a NULL root to mem_cgroup_iter().
 */
#define for_each_mem_cgroup_tree(iter, root)			\
	for ((iter) = mem_cgroup_iter((root), NULL, NULL);	\
	     (iter) != NULL;					\
	     (iter) = mem_cgroup_iter((root), (iter), NULL))

#define for_each_mem_cgroup(iter)				\
	for ((iter) = mem_cgroup_iter(NULL, NULL, NULL);	\
	     (iter) != NULL;					\
	     (iter) = mem_cgroup_iter(NULL, (iter), NULL))
1192
1193 void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
1194 {
1195         struct mem_cgroup *memcg;
1196
1197         rcu_read_lock();
1198         memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
1199         if (unlikely(!memcg))
1200                 goto out;
1201
1202         switch (idx) {
1203         case PGFAULT:
1204                 this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);
1205                 break;
1206         case PGMAJFAULT:
1207                 this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
1208                 break;
1209         default:
1210                 BUG();
1211         }
1212 out:
1213         rcu_read_unlock();
1214 }
1215 EXPORT_SYMBOL(__mem_cgroup_count_vm_event);
1216
1217 /**
1218  * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
1219  * @zone: zone of the wanted lruvec
1220  * @memcg: memcg of the wanted lruvec
1221  *
1222  * Returns the lru list vector holding pages for the given @zone and
1223  * @mem.  This can be the global zone lruvec, if the memory controller
1224  * is disabled.
1225  */
1226 struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
1227                                       struct mem_cgroup *memcg)
1228 {
1229         struct mem_cgroup_per_zone *mz;
1230         struct lruvec *lruvec;
1231
1232         if (mem_cgroup_disabled()) {
1233                 lruvec = &zone->lruvec;
1234                 goto out;
1235         }
1236
1237         mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone));
1238         lruvec = &mz->lruvec;
1239 out:
1240         /*
1241          * Since a node can be onlined after the mem_cgroup was created,
1242          * we have to be prepared to initialize lruvec->zone here;
1243          * and if offlined then reonlined, we need to reinitialize it.
1244          */
1245         if (unlikely(lruvec->zone != zone))
1246                 lruvec->zone = zone;
1247         return lruvec;
1248 }
1249
1250 /*
1251  * Following LRU functions are allowed to be used without PCG_LOCK.
1252  * Operations are called by routine of global LRU independently from memcg.
1253  * What we have to take care of here is validness of pc->mem_cgroup.
1254  *
1255  * Changes to pc->mem_cgroup happens when
1256  * 1. charge
1257  * 2. moving account
1258  * In typical case, "charge" is done before add-to-lru. Exception is SwapCache.
1259  * It is added to LRU before charge.
1260  * If PCG_USED bit is not set, page_cgroup is not added to this private LRU.
1261  * When moving account, the page is not on LRU. It's isolated.
1262  */
1263
1264 /**
1265  * mem_cgroup_page_lruvec - return lruvec for adding an lru page
1266  * @page: the page
1267  * @zone: zone of the page
1268  */
1269 struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
1270 {
1271         struct mem_cgroup_per_zone *mz;
1272         struct mem_cgroup *memcg;
1273         struct page_cgroup *pc;
1274         struct lruvec *lruvec;
1275
1276         if (mem_cgroup_disabled()) {
1277                 lruvec = &zone->lruvec;
1278                 goto out;
1279         }
1280
1281         pc = lookup_page_cgroup(page);
1282         memcg = pc->mem_cgroup;
1283
1284         /*
1285          * Surreptitiously switch any uncharged offlist page to root:
1286          * an uncharged page off lru does nothing to secure
1287          * its former mem_cgroup from sudden removal.
1288          *
1289          * Our caller holds lru_lock, and PageCgroupUsed is updated
1290          * under page_cgroup lock: between them, they make all uses
1291          * of pc->mem_cgroup safe.
1292          */
1293         if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup)
1294                 pc->mem_cgroup = memcg = root_mem_cgroup;
1295
1296         mz = page_cgroup_zoneinfo(memcg, page);
1297         lruvec = &mz->lruvec;
1298 out:
1299         /*
1300          * Since a node can be onlined after the mem_cgroup was created,
1301          * we have to be prepared to initialize lruvec->zone here;
1302          * and if offlined then reonlined, we need to reinitialize it.
1303          */
1304         if (unlikely(lruvec->zone != zone))
1305                 lruvec->zone = zone;
1306         return lruvec;
1307 }
1308
1309 /**
1310  * mem_cgroup_update_lru_size - account for adding or removing an lru page
1311  * @lruvec: mem_cgroup per zone lru vector
1312  * @lru: index of lru list the page is sitting on
1313  * @nr_pages: positive when adding or negative when removing
1314  *
1315  * This function must be called when a page is added to or removed from an
1316  * lru list.
1317  */
1318 void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
1319                                 int nr_pages)
1320 {
1321         struct mem_cgroup_per_zone *mz;
1322         unsigned long *lru_size;
1323
1324         if (mem_cgroup_disabled())
1325                 return;
1326
1327         mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
1328         lru_size = mz->lru_size + lru;
1329         *lru_size += nr_pages;
1330         VM_BUG_ON((long)(*lru_size) < 0);
1331 }
1332
1333 /*
1334  * Checks whether given mem is same or in the root_mem_cgroup's
1335  * hierarchy subtree
1336  */
1337 bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
1338                                   struct mem_cgroup *memcg)
1339 {
1340         if (root_memcg == memcg)
1341                 return true;
1342         if (!root_memcg->use_hierarchy || !memcg)
1343                 return false;
1344         return cgroup_is_descendant(memcg->css.cgroup, root_memcg->css.cgroup);
1345 }
1346
1347 static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
1348                                        struct mem_cgroup *memcg)
1349 {
1350         bool ret;
1351
1352         rcu_read_lock();
1353         ret = __mem_cgroup_same_or_subtree(root_memcg, memcg);
1354         rcu_read_unlock();
1355         return ret;
1356 }
1357
1358 bool task_in_mem_cgroup(struct task_struct *task,
1359                         const struct mem_cgroup *memcg)
1360 {
1361         struct mem_cgroup *curr = NULL;
1362         struct task_struct *p;
1363         bool ret;
1364
1365         p = find_lock_task_mm(task);
1366         if (p) {
1367                 curr = try_get_mem_cgroup_from_mm(p->mm);
1368                 task_unlock(p);
1369         } else {
1370                 /*
1371                  * All threads may have already detached their mm's, but the oom
1372                  * killer still needs to detect if they have already been oom
1373                  * killed to prevent needlessly killing additional tasks.
1374                  */
1375                 rcu_read_lock();
1376                 curr = mem_cgroup_from_task(task);
1377                 if (curr)
1378                         css_get(&curr->css);
1379                 rcu_read_unlock();
1380         }
1381         if (!curr)
1382                 return false;
1383         /*
1384          * We should check use_hierarchy of "memcg" not "curr". Because checking
1385          * use_hierarchy of "curr" here make this function true if hierarchy is
1386          * enabled in "curr" and "curr" is a child of "memcg" in *cgroup*
1387          * hierarchy(even if use_hierarchy is disabled in "memcg").
1388          */
1389         ret = mem_cgroup_same_or_subtree(memcg, curr);
1390         css_put(&curr->css);
1391         return ret;
1392 }
1393
1394 int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
1395 {
1396         unsigned long inactive_ratio;
1397         unsigned long inactive;
1398         unsigned long active;
1399         unsigned long gb;
1400
1401         inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON);
1402         active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON);
1403
1404         gb = (inactive + active) >> (30 - PAGE_SHIFT);
1405         if (gb)
1406                 inactive_ratio = int_sqrt(10 * gb);
1407         else
1408                 inactive_ratio = 1;
1409
1410         return inactive * inactive_ratio < active;
1411 }
1412
/*
 * Get the struct mem_cgroup that embeds res counter @cnt as its member
 * named @field.
 */
#define mem_cgroup_from_res_counter(cnt, field)	\
	container_of(cnt, struct mem_cgroup, field)
1415
1416 /**
1417  * mem_cgroup_margin - calculate chargeable space of a memory cgroup
1418  * @memcg: the memory cgroup
1419  *
1420  * Returns the maximum amount of memory @mem can be charged with, in
1421  * pages.
1422  */
1423 static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
1424 {
1425         unsigned long long margin;
1426
1427         margin = res_counter_margin(&memcg->res);
1428         if (do_swap_account)
1429                 margin = min(margin, res_counter_margin(&memcg->memsw));
1430         return margin >> PAGE_SHIFT;
1431 }
1432
1433 int mem_cgroup_swappiness(struct mem_cgroup *memcg)
1434 {
1435         /* root ? */
1436         if (!css_parent(&memcg->css))
1437                 return vm_swappiness;
1438
1439         return memcg->swappiness;
1440 }
1441
1442 /*
1443  * memcg->moving_account is used for checking possibility that some thread is
1444  * calling move_account(). When a thread on CPU-A starts moving pages under
1445  * a memcg, other threads should check memcg->moving_account under
1446  * rcu_read_lock(), like this:
1447  *
1448  *         CPU-A                                    CPU-B
1449  *                                              rcu_read_lock()
1450  *         memcg->moving_account+1              if (memcg->mocing_account)
1451  *                                                   take heavy locks.
1452  *         synchronize_rcu()                    update something.
1453  *                                              rcu_read_unlock()
1454  *         start move here.
1455  */
1456
1457 /* for quick checking without looking up memcg */
1458 atomic_t memcg_moving __read_mostly;
1459
1460 static void mem_cgroup_start_move(struct mem_cgroup *memcg)
1461 {
1462         atomic_inc(&memcg_moving);
1463         atomic_inc(&memcg->moving_account);
1464         synchronize_rcu();
1465 }
1466
1467 static void mem_cgroup_end_move(struct mem_cgroup *memcg)
1468 {
1469         /*
1470          * Now, mem_cgroup_clear_mc() may call this function with NULL.
1471          * We check NULL in callee rather than caller.
1472          */
1473         if (memcg) {
1474                 atomic_dec(&memcg_moving);
1475                 atomic_dec(&memcg->moving_account);
1476         }
1477 }
1478
1479 /*
1480  * 2 routines for checking "mem" is under move_account() or not.
1481  *
1482  * mem_cgroup_stolen() -  checking whether a cgroup is mc.from or not. This
1483  *                        is used for avoiding races in accounting.  If true,
1484  *                        pc->mem_cgroup may be overwritten.
1485  *
1486  * mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or
1487  *                        under hierarchy of moving cgroups. This is for
1488  *                        waiting at hith-memory prressure caused by "move".
1489  */
1490
1491 static bool mem_cgroup_stolen(struct mem_cgroup *memcg)
1492 {
1493         VM_BUG_ON(!rcu_read_lock_held());
1494         return atomic_read(&memcg->moving_account) > 0;
1495 }
1496
1497 static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
1498 {
1499         struct mem_cgroup *from;
1500         struct mem_cgroup *to;
1501         bool ret = false;
1502         /*
1503          * Unlike task_move routines, we access mc.to, mc.from not under
1504          * mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
1505          */
1506         spin_lock(&mc.lock);
1507         from = mc.from;
1508         to = mc.to;
1509         if (!from)
1510                 goto unlock;
1511
1512         ret = mem_cgroup_same_or_subtree(memcg, from)
1513                 || mem_cgroup_same_or_subtree(memcg, to);
1514 unlock:
1515         spin_unlock(&mc.lock);
1516         return ret;
1517 }
1518
1519 static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
1520 {
1521         if (mc.moving_task && current != mc.moving_task) {
1522                 if (mem_cgroup_under_move(memcg)) {
1523                         DEFINE_WAIT(wait);
1524                         prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
1525                         /* moving charge context might have finished. */
1526                         if (mc.moving_task)
1527                                 schedule();
1528                         finish_wait(&mc.waitq, &wait);
1529                         return true;
1530                 }
1531         }
1532         return false;
1533 }
1534
1535 /*
1536  * Take this lock when
1537  * - a code tries to modify page's memcg while it's USED.
1538  * - a code tries to modify page state accounting in a memcg.
1539  * see mem_cgroup_stolen(), too.
1540  */
1541 static void move_lock_mem_cgroup(struct mem_cgroup *memcg,
1542                                   unsigned long *flags)
1543 {
1544         spin_lock_irqsave(&memcg->move_lock, *flags);
1545 }
1546
1547 static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
1548                                 unsigned long *flags)
1549 {
1550         spin_unlock_irqrestore(&memcg->move_lock, *flags);
1551 }
1552
1553 #define K(x) ((x) << (PAGE_SHIFT-10))
1554 /**
1555  * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller.
1556  * @memcg: The memory cgroup that went over limit
1557  * @p: Task that is going to be killed
1558  *
1559  * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
1560  * enabled
1561  */
1562 void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1563 {
1564         struct cgroup *task_cgrp;
1565         struct cgroup *mem_cgrp;
1566         /*
1567          * Need a buffer in BSS, can't rely on allocations. The code relies
1568          * on the assumption that OOM is serialized for memory controller.
1569          * If this assumption is broken, revisit this code.
1570          */
1571         static char memcg_name[PATH_MAX];
1572         int ret;
1573         struct mem_cgroup *iter;
1574         unsigned int i;
1575
1576         if (!p)
1577                 return;
1578
1579         rcu_read_lock();
1580
1581         mem_cgrp = memcg->css.cgroup;
1582         task_cgrp = task_cgroup(p, mem_cgroup_subsys_id);
1583
1584         ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX);
1585         if (ret < 0) {
1586                 /*
1587                  * Unfortunately, we are unable to convert to a useful name
1588                  * But we'll still print out the usage information
1589                  */
1590                 rcu_read_unlock();
1591                 goto done;
1592         }
1593         rcu_read_unlock();
1594
1595         pr_info("Task in %s killed", memcg_name);
1596
1597         rcu_read_lock();
1598         ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
1599         if (ret < 0) {
1600                 rcu_read_unlock();
1601                 goto done;
1602         }
1603         rcu_read_unlock();
1604
1605         /*
1606          * Continues from above, so we don't need an KERN_ level
1607          */
1608         pr_cont(" as a result of limit of %s\n", memcg_name);
1609 done:
1610
1611         pr_info("memory: usage %llukB, limit %llukB, failcnt %llu\n",
1612                 res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
1613                 res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
1614                 res_counter_read_u64(&memcg->res, RES_FAILCNT));
1615         pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %llu\n",
1616                 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
1617                 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
1618                 res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
1619         pr_info("kmem: usage %llukB, limit %llukB, failcnt %llu\n",
1620                 res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10,
1621                 res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10,
1622                 res_counter_read_u64(&memcg->kmem, RES_FAILCNT));
1623
1624         for_each_mem_cgroup_tree(iter, memcg) {
1625                 pr_info("Memory cgroup stats");
1626
1627                 rcu_read_lock();
1628                 ret = cgroup_path(iter->css.cgroup, memcg_name, PATH_MAX);
1629                 if (!ret)
1630                         pr_cont(" for %s", memcg_name);
1631                 rcu_read_unlock();
1632                 pr_cont(":");
1633
1634                 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
1635                         if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
1636                                 continue;
1637                         pr_cont(" %s:%ldKB", mem_cgroup_stat_names[i],
1638                                 K(mem_cgroup_read_stat(iter, i)));
1639                 }
1640
1641                 for (i = 0; i < NR_LRU_LISTS; i++)
1642                         pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
1643                                 K(mem_cgroup_nr_lru_pages(iter, BIT(i))));
1644
1645                 pr_cont("\n");
1646         }
1647 }
1648
1649 /*
1650  * This function returns the number of memcg under hierarchy tree. Returns
1651  * 1(self count) if no children.
1652  */
1653 static int mem_cgroup_count_children(struct mem_cgroup *memcg)
1654 {
1655         int num = 0;
1656         struct mem_cgroup *iter;
1657
1658         for_each_mem_cgroup_tree(iter, memcg)
1659                 num++;
1660         return num;
1661 }
1662
1663 /*
1664  * Return the memory (and swap, if configured) limit for a memcg.
1665  */
1666 static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
1667 {
1668         u64 limit;
1669
1670         limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
1671
1672         /*
1673          * Do not consider swap space if we cannot swap due to swappiness
1674          */
1675         if (mem_cgroup_swappiness(memcg)) {
1676                 u64 memsw;
1677
1678                 limit += total_swap_pages << PAGE_SHIFT;
1679                 memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
1680
1681                 /*
1682                  * If memsw is finite and limits the amount of swap space
1683                  * available to this memcg, return that limit.
1684                  */
1685                 limit = min(limit, memsw);
1686         }
1687
1688         return limit;
1689 }
1690
1691 static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1692                                      int order)
1693 {
1694         struct mem_cgroup *iter;
1695         unsigned long chosen_points = 0;
1696         unsigned long totalpages;
1697         unsigned int points = 0;
1698         struct task_struct *chosen = NULL;
1699
1700         /*
1701          * If current has a pending SIGKILL or is exiting, then automatically
1702          * select it.  The goal is to allow it to allocate so that it may
1703          * quickly exit and free its memory.
1704          */
1705         if (fatal_signal_pending(current) || current->flags & PF_EXITING) {
1706                 set_thread_flag(TIF_MEMDIE);
1707                 return;
1708         }
1709
1710         check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
1711         totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1;
1712         for_each_mem_cgroup_tree(iter, memcg) {
1713                 struct css_task_iter it;
1714                 struct task_struct *task;
1715
1716                 css_task_iter_start(&iter->css, &it);
1717                 while ((task = css_task_iter_next(&it))) {
1718                         switch (oom_scan_process_thread(task, totalpages, NULL,
1719                                                         false)) {
1720                         case OOM_SCAN_SELECT:
1721                                 if (chosen)
1722                                         put_task_struct(chosen);
1723                                 chosen = task;
1724                                 chosen_points = ULONG_MAX;
1725                                 get_task_struct(chosen);
1726                                 /* fall through */
1727                         case OOM_SCAN_CONTINUE:
1728                                 continue;
1729                         case OOM_SCAN_ABORT:
1730                                 css_task_iter_end(&it);
1731                                 mem_cgroup_iter_break(memcg, iter);
1732                                 if (chosen)
1733                                         put_task_struct(chosen);
1734                                 return;
1735                         case OOM_SCAN_OK:
1736                                 break;
1737                         };
1738                         points = oom_badness(task, memcg, NULL, totalpages);
1739                         if (points > chosen_points) {
1740                                 if (chosen)
1741                                         put_task_struct(chosen);
1742                                 chosen = task;
1743                                 chosen_points = points;
1744                                 get_task_struct(chosen);
1745                         }
1746                 }
1747                 css_task_iter_end(&it);
1748         }
1749
1750         if (!chosen)
1751                 return;
1752         points = chosen_points * 1000 / totalpages;
1753         oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg,
1754                          NULL, "Memory cgroup out of memory");
1755 }
1756
1757 static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
1758                                         gfp_t gfp_mask,
1759                                         unsigned long flags)
1760 {
1761         unsigned long total = 0;
1762         bool noswap = false;
1763         int loop;
1764
1765         if (flags & MEM_CGROUP_RECLAIM_NOSWAP)
1766                 noswap = true;
1767         if (!(flags & MEM_CGROUP_RECLAIM_SHRINK) && memcg->memsw_is_minimum)
1768                 noswap = true;
1769
1770         for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) {
1771                 if (loop)
1772                         drain_all_stock_async(memcg);
1773                 total += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap);
1774                 /*
1775                  * Allow limit shrinkers, which are triggered directly
1776                  * by userspace, to catch signals and stop reclaim
1777                  * after minimal progress, regardless of the margin.
1778                  */
1779                 if (total && (flags & MEM_CGROUP_RECLAIM_SHRINK))
1780                         break;
1781                 if (mem_cgroup_margin(memcg))
1782                         break;
1783                 /*
1784                  * If nothing was reclaimed after two attempts, there
1785                  * may be no reclaimable pages in this hierarchy.
1786                  */
1787                 if (loop && !total)
1788                         break;
1789         }
1790         return total;
1791 }
1792
1793 #if MAX_NUMNODES > 1
1794 /**
1795  * test_mem_cgroup_node_reclaimable
1796  * @memcg: the target memcg
1797  * @nid: the node ID to be checked.
1798  * @noswap : specify true here if the user wants flle only information.
1799  *
1800  * This function returns whether the specified memcg contains any
1801  * reclaimable pages on a node. Returns true if there are any reclaimable
1802  * pages in the node.
1803  */
1804 static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
1805                 int nid, bool noswap)
1806 {
1807         if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
1808                 return true;
1809         if (noswap || !total_swap_pages)
1810                 return false;
1811         if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))
1812                 return true;
1813         return false;
1814
1815 }
1816
1817 /*
1818  * Always updating the nodemask is not very good - even if we have an empty
1819  * list or the wrong list here, we can start from some node and traverse all
1820  * nodes based on the zonelist. So update the list loosely once per 10 secs.
1821  *
1822  */
1823 static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
1824 {
1825         int nid;
1826         /*
1827          * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
1828          * pagein/pageout changes since the last update.
1829          */
1830         if (!atomic_read(&memcg->numainfo_events))
1831                 return;
1832         if (atomic_inc_return(&memcg->numainfo_updating) > 1)
1833                 return;
1834
1835         /* make a nodemask where this memcg uses memory from */
1836         memcg->scan_nodes = node_states[N_MEMORY];
1837
1838         for_each_node_mask(nid, node_states[N_MEMORY]) {
1839
1840                 if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
1841                         node_clear(nid, memcg->scan_nodes);
1842         }
1843
1844         atomic_set(&memcg->numainfo_events, 0);
1845         atomic_set(&memcg->numainfo_updating, 0);
1846 }
1847
1848 /*
1849  * Selecting a node where we start reclaim from. Because what we need is just
1850  * reducing usage counter, start from anywhere is O,K. Considering
1851  * memory reclaim from current node, there are pros. and cons.
1852  *
1853  * Freeing memory from current node means freeing memory from a node which
1854  * we'll use or we've used. So, it may make LRU bad. And if several threads
1855  * hit limits, it will see a contention on a node. But freeing from remote
1856  * node means more costs for memory reclaim because of memory latency.
1857  *
1858  * Now, we use round-robin. Better algorithm is welcomed.
1859  */
1860 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1861 {
1862         int node;
1863
1864         mem_cgroup_may_update_nodemask(memcg);
1865         node = memcg->last_scanned_node;
1866
1867         node = next_node(node, memcg->scan_nodes);
1868         if (node == MAX_NUMNODES)
1869                 node = first_node(memcg->scan_nodes);
1870         /*
1871          * We call this when we hit limit, not when pages are added to LRU.
1872          * No LRU may hold pages because all pages are UNEVICTABLE or
1873          * memcg is too small and all pages are not on LRU. In that case,
1874          * we use curret node.
1875          */
1876         if (unlikely(node == MAX_NUMNODES))
1877                 node = numa_node_id();
1878
1879         memcg->last_scanned_node = node;
1880         return node;
1881 }
1882
1883 #else
/*
 * Variant compiled when the "#if MAX_NUMNODES > 1" branch above is not:
 * always reports node 0 as the victim node.
 */
int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
{
	return 0;
}
1888
1889 #endif
1890
1891 /*
1892  * A group is eligible for the soft limit reclaim under the given root
1893  * hierarchy if
1894  *      a) it is over its soft limit
1895  *      b) any parent up the hierarchy is over its soft limit
1896  *
1897  * If the given group doesn't have any children over the limit then it
1898  * doesn't make any sense to iterate its subtree.
1899  */
1900 enum mem_cgroup_filter_t
1901 mem_cgroup_soft_reclaim_eligible(struct mem_cgroup *memcg,
1902                 struct mem_cgroup *root)
1903 {
1904         struct mem_cgroup *parent;
1905
1906         if (!memcg)
1907                 memcg = root_mem_cgroup;
1908         parent = memcg;
1909
1910         if (res_counter_soft_limit_excess(&memcg->res))
1911                 return VISIT;
1912
1913         /*
1914          * If any parent up to the root in the hierarchy is over its soft limit
1915          * then we have to obey and reclaim from this group as well.
1916          */
1917         while ((parent = parent_mem_cgroup(parent))) {
1918                 if (res_counter_soft_limit_excess(&parent->res))
1919                         return VISIT;
1920                 if (parent == root)
1921                         break;
1922         }
1923
1924         if (!atomic_read(&memcg->children_in_excess))
1925                 return SKIP_TREE;
1926         return SKIP;
1927 }
1928
/*
 * Taken by mem_cgroup_oom_trylock() and mem_cgroup_oom_unlock() around
 * their updates of the per-memcg ->oom_lock flags.
 */
static DEFINE_SPINLOCK(memcg_oom_lock);
1930
1931 /*
1932  * Check OOM-Killer is already running under our hierarchy.
1933  * If someone is running, return false.
1934  */
1935 static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
1936 {
1937         struct mem_cgroup *iter, *failed = NULL;
1938
1939         spin_lock(&memcg_oom_lock);
1940
1941         for_each_mem_cgroup_tree(iter, memcg) {
1942                 if (iter->oom_lock) {
1943                         /*
1944                          * this subtree of our hierarchy is already locked
1945                          * so we cannot give a lock.
1946                          */
1947                         failed = iter;
1948                         mem_cgroup_iter_break(memcg, iter);
1949                         break;
1950                 } else
1951                         iter->oom_lock = true;
1952         }
1953
1954         if (failed) {
1955                 /*
1956                  * OK, we failed to lock the whole subtree so we have
1957                  * to clean up what we set up to the failing subtree
1958                  */
1959                 for_each_mem_cgroup_tree(iter, memcg) {
1960                         if (iter == failed) {
1961                                 mem_cgroup_iter_break(memcg, iter);
1962                                 break;
1963                         }
1964                         iter->oom_lock = false;
1965                 }
1966         }
1967
1968         spin_unlock(&memcg_oom_lock);
1969
1970         return !failed;
1971 }
1972
1973 static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
1974 {
1975         struct mem_cgroup *iter;
1976
1977         spin_lock(&memcg_oom_lock);
1978         for_each_mem_cgroup_tree(iter, memcg)
1979                 iter->oom_lock = false;
1980         spin_unlock(&memcg_oom_lock);
1981 }
1982
1983 static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
1984 {
1985         struct mem_cgroup *iter;
1986
1987         for_each_mem_cgroup_tree(iter, memcg)
1988                 atomic_inc(&iter->under_oom);
1989 }
1990
1991 static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
1992 {
1993         struct mem_cgroup *iter;
1994
1995         /*
1996          * When a new child is created while the hierarchy is under oom,
1997          * mem_cgroup_oom_lock() may not be called. We have to use
1998          * atomic_add_unless() here.
1999          */
2000         for_each_mem_cgroup_tree(iter, memcg)
2001                 atomic_add_unless(&iter->under_oom, -1, 0);
2002 }
2003
/*
 * Wait queue for tasks sleeping on a memcg OOM.  It is a single file-scope
 * queue; memcg_oom_wake_function() filters wakeups by memcg hierarchy.
 */
static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);

/* A waiter on memcg_oom_waitq together with the memcg it is waiting on. */
struct oom_wait_info {
	struct mem_cgroup *memcg;	/* memcg whose OOM the task waits on */
	wait_queue_t	wait;		/* entry queued on memcg_oom_waitq */
};
2010
2011 static int memcg_oom_wake_function(wait_queue_t *wait,
2012         unsigned mode, int sync, void *arg)
2013 {
2014         struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
2015         struct mem_cgroup *oom_wait_memcg;
2016         struct oom_wait_info *oom_wait_info;
2017
2018         oom_wait_info = container_of(wait, struct oom_wait_info, wait);
2019         oom_wait_memcg = oom_wait_info->memcg;
2020
2021         /*
2022          * Both of oom_wait_info->memcg and wake_memcg are stable under us.
2023          * Then we can use css_is_ancestor without taking care of RCU.
2024          */
2025         if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg)
2026                 && !mem_cgroup_same_or_subtree(wake_memcg, oom_wait_memcg))
2027                 return 0;
2028         return autoremove_wake_function(wait, mode, sync, arg);
2029 }
2030
/*
 * Record a wakeup on @memcg and wake the tasks sleeping on memcg_oom_waitq.
 *
 * ->oom_wakeups is bumped first: mem_cgroup_oom_synchronize() compares it
 * with the snapshot taken in mem_cgroup_oom() to notice wakeups that
 * happened before the task went to sleep.
 */
static void memcg_wakeup_oom(struct mem_cgroup *memcg)
{
	atomic_inc(&memcg->oom_wakeups);
	/* for filtering, pass "memcg" as argument. */
	__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
}
2037
2038 static void memcg_oom_recover(struct mem_cgroup *memcg)
2039 {
2040         if (memcg && atomic_read(&memcg->under_oom))
2041                 memcg_wakeup_oom(memcg);
2042 }
2043
2044 /*
2045  * try to call OOM killer
2046  */
2047 static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
2048 {
2049         bool locked;
2050         int wakeups;
2051
2052         if (!current->memcg_oom.may_oom)
2053                 return;
2054
2055         current->memcg_oom.in_memcg_oom = 1;
2056
2057         /*
2058          * As with any blocking lock, a contender needs to start
2059          * listening for wakeups before attempting the trylock,
2060          * otherwise it can miss the wakeup from the unlock and sleep
2061          * indefinitely.  This is just open-coded because our locking
2062          * is so particular to memcg hierarchies.
2063          */
2064         wakeups = atomic_read(&memcg->oom_wakeups);
2065         mem_cgroup_mark_under_oom(memcg);
2066
2067         locked = mem_cgroup_oom_trylock(memcg);
2068
2069         if (locked)
2070                 mem_cgroup_oom_notify(memcg);
2071
2072         if (locked && !memcg->oom_kill_disable) {
2073                 mem_cgroup_unmark_under_oom(memcg);
2074                 mem_cgroup_out_of_memory(memcg, mask, order);
2075                 mem_cgroup_oom_unlock(memcg);
2076                 /*
2077                  * There is no guarantee that an OOM-lock contender
2078                  * sees the wakeups triggered by the OOM kill
2079                  * uncharges.  Wake any sleepers explicitely.
2080                  */
2081                 memcg_oom_recover(memcg);
2082         } else {
2083                 /*
2084                  * A system call can just return -ENOMEM, but if this
2085                  * is a page fault and somebody else is handling the
2086                  * OOM already, we need to sleep on the OOM waitqueue
2087                  * for this memcg until the situation is resolved.
2088                  * Which can take some time because it might be
2089                  * handled by a userspace task.
2090                  *
2091                  * However, this is the charge context, which means
2092                  * that we may sit on a large call stack and hold
2093                  * various filesystem locks, the mmap_sem etc. and we
2094                  * don't want the OOM handler to deadlock on them
2095                  * while we sit here and wait.  Store the current OOM
2096                  * context in the task_struct, then return -ENOMEM.
2097                  * At the end of the page fault handler, with the
2098                  * stack unwound, pagefault_out_of_memory() will check
2099                  * back with us by calling
2100                  * mem_cgroup_oom_synchronize(), possibly putting the
2101                  * task to sleep.
2102                  */
2103                 current->memcg_oom.oom_locked = locked;
2104                 current->memcg_oom.wakeups = wakeups;
2105                 css_get(&memcg->css);
2106                 current->memcg_oom.wait_on_memcg = memcg;
2107         }
2108 }
2109
2110 /**
2111  * mem_cgroup_oom_synchronize - complete memcg OOM handling
2112  *
2113  * This has to be called at the end of a page fault if the the memcg
2114  * OOM handler was enabled and the fault is returning %VM_FAULT_OOM.
2115  *
2116  * Memcg supports userspace OOM handling, so failed allocations must
2117  * sleep on a waitqueue until the userspace task resolves the
2118  * situation.  Sleeping directly in the charge context with all kinds
2119  * of locks held is not a good idea, instead we remember an OOM state
2120  * in the task and mem_cgroup_oom_synchronize() has to be called at
2121  * the end of the page fault to put the task to sleep and clean up the
2122  * OOM state.
2123  *
2124  * Returns %true if an ongoing memcg OOM situation was detected and
2125  * finalized, %false otherwise.
2126  */
2127 bool mem_cgroup_oom_synchronize(void)
2128 {
2129         struct oom_wait_info owait;
2130         struct mem_cgroup *memcg;
2131
2132         /* OOM is global, do not handle */
2133         if (!current->memcg_oom.in_memcg_oom)
2134                 return false;
2135
2136         /*
2137          * We invoked the OOM killer but there is a chance that a kill
2138          * did not free up any charges.  Everybody else might already
2139          * be sleeping, so restart the fault and keep the rampage
2140          * going until some charges are released.
2141          */
2142         memcg = current->memcg_oom.wait_on_memcg;
2143         if (!memcg)
2144                 goto out;
2145
2146         if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
2147                 goto out_memcg;
2148
2149         owait.memcg = memcg;
2150         owait.wait.flags = 0;
2151         owait.wait.func = memcg_oom_wake_function;
2152         owait.wait.private = current;
2153         INIT_LIST_HEAD(&owait.wait.task_list);
2154
2155         prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
2156         /* Only sleep if we didn't miss any wakeups since OOM */
2157         if (atomic_read(&memcg->oom_wakeups) == current->memcg_oom.wakeups)
2158                 schedule();
2159         finish_wait(&memcg_oom_waitq, &owait.wait);
2160 out_memcg:
2161         mem_cgroup_unmark_under_oom(memcg);
2162         if (current->memcg_oom.oom_locked) {
2163                 mem_cgroup_oom_unlock(memcg);
2164                 /*
2165                  * There is no guarantee that an OOM-lock contender
2166                  * sees the wakeups triggered by the OOM kill
2167                  * uncharges.  Wake any sleepers explicitely.
2168                  */
2169                 memcg_oom_recover(memcg);
2170         }
2171         css_put(&memcg->css);
2172         current->memcg_oom.wait_on_memcg = NULL;
2173 out:
2174         current->memcg_oom.in_memcg_oom = 0;
2175         return true;
2176 }
2177
2178 /*
2179  * Currently used to update mapped file statistics, but the routine can be
2180  * generalized to update other statistics as well.
2181  *
2182  * Notes: Race condition
2183  *
2184  * We usually use page_cgroup_lock() for accessing page_cgroup member but
2185  * it tends to be costly. But considering some conditions, we doesn't need
2186  * to do so _always_.
2187  *
2188  * Considering "charge", lock_page_cgroup() is not required because all
2189  * file-stat operations happen after a page is attached to radix-tree. There
2190  * are no race with "charge".
2191  *
2192  * Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup
2193  * at "uncharge" intentionally. So, we always see valid pc->mem_cgroup even
2194  * if there are race with "uncharge". Statistics itself is properly handled
2195  * by flags.
2196  *
2197  * Considering "move", this is an only case we see a race. To make the race
2198  * small, we check mm->moving_account and detect there are possibility of race
2199  * If there is, we take a lock.
2200  */
2201
2202 void __mem_cgroup_begin_update_page_stat(struct page *page,
2203                                 bool *locked, unsigned long *flags)
2204 {
2205         struct mem_cgroup *memcg;
2206         struct page_cgroup *pc;
2207
2208         pc = lookup_page_cgroup(page);
2209 again:
2210         memcg = pc->mem_cgroup;
2211         if (unlikely(!memcg || !PageCgroupUsed(pc)))
2212                 return;
2213         /*
2214          * If this memory cgroup is not under account moving, we don't
2215          * need to take move_lock_mem_cgroup(). Because we already hold
2216          * rcu_read_lock(), any calls to move_account will be delayed until
2217          * rcu_read_unlock() if mem_cgroup_stolen() == true.
2218          */
2219         if (!mem_cgroup_stolen(memcg))
2220                 return;
2221
2222         move_lock_mem_cgroup(memcg, flags);
2223         if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) {
2224                 move_unlock_mem_cgroup(memcg, flags);
2225                 goto again;
2226         }
2227         *locked = true;
2228 }
2229
2230 void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags)
2231 {
2232         struct page_cgroup *pc = lookup_page_cgroup(page);
2233
2234         /*
2235          * It's guaranteed that pc->mem_cgroup never changes while
2236          * lock is held because a routine modifies pc->mem_cgroup
2237          * should take move_lock_mem_cgroup().
2238          */
2239         move_unlock_mem_cgroup(pc->mem_cgroup, flags);
2240 }
2241
2242 void mem_cgroup_update_page_stat(struct page *page,
2243                                  enum mem_cgroup_stat_index idx, int val)
2244 {
2245         struct mem_cgroup *memcg;
2246         struct page_cgroup *pc = lookup_page_cgroup(page);
2247         unsigned long uninitialized_var(flags);
2248
2249         if (mem_cgroup_disabled())
2250                 return;
2251
2252         VM_BUG_ON(!rcu_read_lock_held());
2253         memcg = pc->mem_cgroup;
2254         if (unlikely(!memcg || !PageCgroupUsed(pc)))
2255                 return;
2256
2257         this_cpu_add(memcg->stat->count[idx], val);
2258 }
2259
/*
 * size of first charge trial. "32" comes from vmscan.c's magic value.
 * TODO: maybe necessary to use big numbers in big irons.
 */
#define CHARGE_BATCH	32U
/* Per-cpu stock of pages that are already charged to one memcg. */
struct memcg_stock_pcp {
	struct mem_cgroup *cached; /* owner of the stock; never the root cgroup */
	unsigned int nr_pages;	/* charged pages currently held in the stock */
	struct work_struct work;	/* runs drain_local_stock() on this cpu */
	unsigned long flags;	/* bit field, see FLUSHING_CACHED_CHARGE */
/* Bit in ->flags: set when a drain is started, cleared by drain_local_stock() */
#define FLUSHING_CACHED_CHARGE	0
};
static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
/* Held by drain_all_stock_async()/drain_all_stock_sync() around drain_all_stock() */
static DEFINE_MUTEX(percpu_charge_mutex);
2274
2275 /**
2276  * consume_stock: Try to consume stocked charge on this cpu.
2277  * @memcg: memcg to consume from.
2278  * @nr_pages: how many pages to charge.
2279  *
2280  * The charges will only happen if @memcg matches the current cpu's memcg
2281  * stock, and at least @nr_pages are available in that stock.  Failure to
2282  * service an allocation will refill the stock.
2283  *
2284  * returns true if successful, false otherwise.
2285  */
2286 static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2287 {
2288         struct memcg_stock_pcp *stock;
2289         bool ret = true;
2290
2291         if (nr_pages > CHARGE_BATCH)
2292                 return false;
2293
2294         stock = &get_cpu_var(memcg_stock);
2295         if (memcg == stock->cached && stock->nr_pages >= nr_pages)
2296                 stock->nr_pages -= nr_pages;
2297         else /* need to call res_counter_charge */
2298                 ret = false;
2299         put_cpu_var(memcg_stock);
2300         return ret;
2301 }
2302
2303 /*
2304  * Returns stocks cached in percpu to res_counter and reset cached information.
2305  */
2306 static void drain_stock(struct memcg_stock_pcp *stock)
2307 {
2308         struct mem_cgroup *old = stock->cached;
2309
2310         if (stock->nr_pages) {
2311                 unsigned long bytes = stock->nr_pages * PAGE_SIZE;
2312
2313                 res_counter_uncharge(&old->res, bytes);
2314                 if (do_swap_account)
2315                         res_counter_uncharge(&old->memsw, bytes);
2316                 stock->nr_pages = 0;
2317         }
2318         stock->cached = NULL;
2319 }
2320
2321 /*
2322  * This must be called under preempt disabled or must be called by
2323  * a thread which is pinned to local cpu.
2324  */
2325 static void drain_local_stock(struct work_struct *dummy)
2326 {
2327         struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
2328         drain_stock(stock);
2329         clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
2330 }
2331
2332 static void __init memcg_stock_init(void)
2333 {
2334         int cpu;
2335
2336         for_each_possible_cpu(cpu) {
2337                 struct memcg_stock_pcp *stock =
2338                                         &per_cpu(memcg_stock, cpu);
2339                 INIT_WORK(&stock->work, drain_local_stock);
2340         }
2341 }
2342
2343 /*
2344  * Cache charges(val) which is from res_counter, to local per_cpu area.
2345  * This will be consumed by consume_stock() function, later.
2346  */
2347 static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2348 {
2349         struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
2350
2351         if (stock->cached != memcg) { /* reset if necessary */
2352                 drain_stock(stock);
2353                 stock->cached = memcg;
2354         }
2355         stock->nr_pages += nr_pages;
2356         put_cpu_var(memcg_stock);
2357 }
2358
2359 /*
2360  * Drains all per-CPU charge caches for given root_memcg resp. subtree
2361  * of the hierarchy under it. sync flag says whether we should block
2362  * until the work is done.
2363  */
2364 static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync)
2365 {
2366         int cpu, curcpu;
2367
2368         /* Notify other cpus that system-wide "drain" is running */
2369         get_online_cpus();
2370         curcpu = get_cpu();
2371         for_each_online_cpu(cpu) {
2372                 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2373                 struct mem_cgroup *memcg;
2374
2375                 memcg = stock->cached;
2376                 if (!memcg || !stock->nr_pages)
2377                         continue;
2378                 if (!mem_cgroup_same_or_subtree(root_memcg, memcg))
2379                         continue;
2380                 if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
2381                         if (cpu == curcpu)
2382                                 drain_local_stock(&stock->work);
2383                         else
2384                                 schedule_work_on(cpu, &stock->work);
2385                 }
2386         }
2387         put_cpu();
2388
2389         if (!sync)
2390                 goto out;
2391
2392         for_each_online_cpu(cpu) {
2393                 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2394                 if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
2395                         flush_work(&stock->work);
2396         }
2397 out:
2398         put_online_cpus();
2399 }
2400
2401 /*
2402  * Tries to drain stocked charges in other cpus. This function is asynchronous
2403  * and just put a work per cpu for draining localy on each cpu. Caller can
2404  * expects some charges will be back to res_counter later but cannot wait for
2405  * it.
2406  */
2407 static void drain_all_stock_async(struct mem_cgroup *root_memcg)
2408 {
2409         /*
2410          * If someone calls draining, avoid adding more kworker runs.
2411          */
2412         if (!mutex_trylock(&percpu_charge_mutex))
2413                 return;
2414         drain_all_stock(root_memcg, false);
2415         mutex_unlock(&percpu_charge_mutex);
2416 }
2417
/*
 * This is a synchronous drain interface: it takes percpu_charge_mutex
 * (sleeping if another drain holds it) and calls drain_all_stock() with
 * sync set, so the queued drain works are flushed before it returns.
 */
static void drain_all_stock_sync(struct mem_cgroup *root_memcg)
{
	/* called when force_empty is called */
	mutex_lock(&percpu_charge_mutex);
	drain_all_stock(root_memcg, true);
	mutex_unlock(&percpu_charge_mutex);
}
2426
2427 /*
2428  * This function drains percpu counter value from DEAD cpu and
2429  * move it to local cpu. Note that this function can be preempted.
2430  */
2431 static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)
2432 {
2433         int i;
2434
2435         spin_lock(&memcg->pcp_counter_lock);
2436         for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
2437                 long x = per_cpu(memcg->stat->count[i], cpu);
2438
2439                 per_cpu(memcg->stat->count[i], cpu) = 0;
2440                 memcg->nocpu_base.count[i] += x;
2441         }
2442         for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
2443                 unsigned long x = per_cpu(memcg->stat->events[i], cpu);
2444
2445                 per_cpu(memcg->stat->events[i], cpu) = 0;
2446                 memcg->nocpu_base.events[i] += x;
2447         }
2448         spin_unlock(&memcg->pcp_counter_lock);
2449 }
2450
2451 static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
2452                                         unsigned long action,
2453                                         void *hcpu)
2454 {
2455         int cpu = (unsigned long)hcpu;
2456         struct memcg_stock_pcp *stock;
2457         struct mem_cgroup *iter;
2458
2459         if (action == CPU_ONLINE)
2460                 return NOTIFY_OK;
2461
2462         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
2463                 return NOTIFY_OK;
2464
2465         for_each_mem_cgroup(iter)
2466                 mem_cgroup_drain_pcp_counter(iter, cpu);
2467
2468         stock = &per_cpu(memcg_stock, cpu);
2469         drain_stock(stock);
2470         return NOTIFY_OK;
2471 }
2472
2473
/*
 * Result codes returned by mem_cgroup_do_charge().
 * See __mem_cgroup_try_charge() for details.
 */
enum {
	CHARGE_OK,		/* success */
	CHARGE_RETRY,		/* need to retry but retry is not bad */
	CHARGE_NOMEM,		/* we can't do more. return -ENOMEM */
	CHARGE_WOULDBLOCK,	/* __GFP_WAIT wasn't set and not enough resources */
};
2481
/*
 * Make one attempt at charging @nr_pages pages to @memcg.
 *
 * @memcg:      group whose res counter (and, with do_swap_account, memsw
 *              counter) is charged
 * @gfp_mask:   caller's allocation mask; __GFP_WAIT and __GFP_NORETRY are
 *              tested here and the mask is handed on to reclaim and OOM
 * @nr_pages:   number of pages to charge in this attempt
 * @min_pages:  if @nr_pages is larger than this and the charge fails,
 *              CHARGE_RETRY is returned without reclaiming
 * @invoke_oom: call mem_cgroup_oom() before giving up with CHARGE_NOMEM
 *
 * Returns one of the CHARGE_* codes.
 */
static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
                                unsigned int nr_pages, unsigned int min_pages,
                                bool invoke_oom)
{
        unsigned long csize = nr_pages * PAGE_SIZE;
        struct mem_cgroup *mem_over_limit;
        struct res_counter *fail_res;
        unsigned long flags = 0;
        int ret;

        ret = res_counter_charge(&memcg->res, csize, &fail_res);

        if (likely(!ret)) {
                if (!do_swap_account)
                        return CHARGE_OK;
                ret = res_counter_charge(&memcg->memsw, csize, &fail_res);
                if (likely(!ret))
                        return CHARGE_OK;

                /* memsw charge failed: undo the res charge taken above */
                res_counter_uncharge(&memcg->res, csize);
                mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
                flags |= MEM_CGROUP_RECLAIM_NOSWAP;
        } else
                mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
        /*
         * Never reclaim on behalf of optional batching, retry with a
         * single page instead.
         */
        if (nr_pages > min_pages)
                return CHARGE_RETRY;

        if (!(gfp_mask & __GFP_WAIT))
                return CHARGE_WOULDBLOCK;

        if (gfp_mask & __GFP_NORETRY)
                return CHARGE_NOMEM;

        /* reclaim from the group whose counter refused the charge */
        ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);
        if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
                return CHARGE_RETRY;
        /*
         * Even though the limit is exceeded at this point, reclaim
         * may have been able to free some pages.  Retry the charge
         * before killing the task.
         *
         * Only for regular pages, though: huge pages are rather
         * unlikely to succeed so close to the limit, and we fall back
         * to regular pages anyway in case of failure.
         */
        if (nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER) && ret)
                return CHARGE_RETRY;

        /*
         * At task move, charge accounts can be doubly counted. So, it's
         * better to wait until the end of task_move if something is going on.
         */
        if (mem_cgroup_wait_acct_move(mem_over_limit))
                return CHARGE_RETRY;

        if (invoke_oom)
                mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(csize));

        return CHARGE_NOMEM;
}
2546
/*
 * __mem_cgroup_try_charge() does
 * 1. detect memcg to be charged against from passed *mm and *ptr,
 * 2. update res_counter
 * 3. call memory reclaim if necessary.
 *
 * @mm:       mm whose owner task selects the memcg when *@ptr is NULL
 * @gfp_mask: allocation mask, passed on to mem_cgroup_do_charge()
 * @nr_pages: number of pages to charge
 * @ptr:      in: memcg to charge, or NULL to derive it from @mm;
 *            out: see the return value list below
 * @oom:      whether the oom-killer may be invoked
 *
 * In some special case, if the task is fatal, fatal_signal_pending() or
 * has TIF_MEMDIE, this function returns -EINTR while writing root_mem_cgroup
 * to *ptr. There are two reasons for this. 1: fatal threads should quit as soon
 * as possible without any hazards. 2: all pages should have a valid
 * pc->mem_cgroup. If mm is NULL and the caller doesn't pass a valid memcg
 * pointer, that is treated as a charge to root_mem_cgroup.
 *
 * So __mem_cgroup_try_charge() will return
 *  0       ...  on success, filling *ptr with a valid memcg pointer.
 *  -ENOMEM ...  charge failure because of resource limits. *ptr is set to NULL.
 *  -EINTR  ...  if thread is fatal. *ptr is filled with root_mem_cgroup.
 *
 * Unlike the exported interface, an "oom" parameter is added. if oom==true,
 * the oom-killer can be invoked.
 */
static int __mem_cgroup_try_charge(struct mm_struct *mm,
                                   gfp_t gfp_mask,
                                   unsigned int nr_pages,
                                   struct mem_cgroup **ptr,
                                   bool oom)
{
        /* first attempt charges a whole batch; the surplus refills the stock */
        unsigned int batch = max(CHARGE_BATCH, nr_pages);
        int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
        struct mem_cgroup *memcg = NULL;
        int ret;

        /*
         * Unlike global-vm's OOM-kill, we're not in memory shortage
         * in system level. So, allow to go ahead dying process in addition to
         * MEMDIE process.
         */
        if (unlikely(test_thread_flag(TIF_MEMDIE)
                     || fatal_signal_pending(current)))
                goto bypass;

        /*
         * We always charge the cgroup the mm_struct belongs to.
         * The mm_struct's mem_cgroup changes on task migration if the
         * thread group leader migrates. It's possible that mm is not
         * set, if so charge the root memcg (happens for pagecache usage).
         */
        if (!*ptr && !mm)
                *ptr = root_mem_cgroup;
again:
        if (*ptr) { /* css should be a valid one */
                memcg = *ptr;
                if (mem_cgroup_is_root(memcg))
                        goto done;
                if (consume_stock(memcg, nr_pages))
                        goto done;
                css_get(&memcg->css);
        } else {
                struct task_struct *p;

                rcu_read_lock();
                p = rcu_dereference(mm->owner);
                /*
                 * Because we don't have task_lock(), "p" can exit.
                 * In that case, "memcg" can point to root or p can be NULL with
                 * race with swapoff. Then, we have small risk of mis-accounting.
                 * But such kind of mis-account by race always happens because
                 * we don't have cgroup_mutex(). It's overkill and we allow that
                 * small race, here.
                 * (*) swapoff et al. will charge against mm-struct not against
                 * task-struct. So, mm->owner can be NULL.
                 */
                memcg = mem_cgroup_from_task(p);
                if (!memcg)
                        memcg = root_mem_cgroup;
                if (mem_cgroup_is_root(memcg)) {
                        rcu_read_unlock();
                        goto done;
                }
                if (consume_stock(memcg, nr_pages)) {
                        /*
                         * It seems dangerous to access memcg without css_get().
                         * But considering how consume_stock works, it's not
                         * necessary. If consume_stock success, some charges
                         * from this memcg are cached on this cpu. So, we
                         * don't need to call css_get()/css_tryget() before
                         * calling consume_stock().
                         */
                        rcu_read_unlock();
                        goto done;
                }
                /* after here, we may be blocked. we need to get refcnt */
                if (!css_tryget(&memcg->css)) {
                        rcu_read_unlock();
                        goto again;
                }
                rcu_read_unlock();
        }

        do {
                /* the oom-killer is only allowed once the retries are used up */
                bool invoke_oom = oom && !nr_oom_retries;

                /* If killed, bypass charge */
                if (fatal_signal_pending(current)) {
                        css_put(&memcg->css);
                        goto bypass;
                }

                ret = mem_cgroup_do_charge(memcg, gfp_mask, batch,
                                           nr_pages, invoke_oom);
                switch (ret) {
                case CHARGE_OK:
                        break;
                case CHARGE_RETRY: /* not in OOM situation but retry */
                        /* drop batching and look the memcg up again */
                        batch = nr_pages;
                        css_put(&memcg->css);
                        memcg = NULL;
                        goto again;
                case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
                        css_put(&memcg->css);
                        goto nomem;
                case CHARGE_NOMEM: /* OOM routine works */
                        if (!oom || invoke_oom) {
                                css_put(&memcg->css);
                                goto nomem;
                        }
                        nr_oom_retries--;
                        break;
                }
        } while (ret != CHARGE_OK);

        /* pages charged beyond the request are kept in the per-cpu stock */
        if (batch > nr_pages)
                refill_stock(memcg, batch - nr_pages);
        css_put(&memcg->css);
done:
        *ptr = memcg;
        return 0;
nomem:
        *ptr = NULL;
        return -ENOMEM;
bypass:
        *ptr = root_mem_cgroup;
        return -EINTR;
}
2691
2692 /*
2693  * Somemtimes we have to undo a charge we got by try_charge().
2694  * This function is for that and do uncharge, put css's refcnt.
2695  * gotten by try_charge().
2696  */
2697 static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg,
2698                                        unsigned int nr_pages)
2699 {
2700         if (!mem_cgroup_is_root(memcg)) {
2701                 unsigned long bytes = nr_pages * PAGE_SIZE;
2702
2703                 res_counter_uncharge(&memcg->res, bytes);
2704                 if (do_swap_account)
2705                         res_counter_uncharge(&memcg->memsw, bytes);
2706         }
2707 }
2708
2709 /*
2710  * Cancel chrages in this cgroup....doesn't propagate to parent cgroup.
2711  * This is useful when moving usage to parent cgroup.
2712  */
2713 static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,
2714                                         unsigned int nr_pages)
2715 {
2716         unsigned long bytes = nr_pages * PAGE_SIZE;
2717
2718         if (mem_cgroup_is_root(memcg))
2719                 return;
2720
2721         res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes);
2722         if (do_swap_account)
2723                 res_counter_uncharge_until(&memcg->memsw,
2724                                                 memcg->memsw.parent, bytes);
2725 }
2726
2727 /*
2728  * A helper function to get mem_cgroup from ID. must be called under
2729  * rcu_read_lock().  The caller is responsible for calling css_tryget if
2730  * the mem_cgroup is used for charging. (dropping refcnt from swap can be
2731  * called against removed memcg.)
2732  */
2733 static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
2734 {
2735         /* ID 0 is unused ID */
2736         if (!id)
2737                 return NULL;
2738         return mem_cgroup_from_id(id);
2739 }
2740
2741 struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
2742 {
2743         struct mem_cgroup *memcg = NULL;
2744         struct page_cgroup *pc;
2745         unsigned short id;
2746         swp_entry_t ent;
2747
2748         VM_BUG_ON(!PageLocked(page));
2749
2750         pc = lookup_page_cgroup(page);
2751         lock_page_cgroup(pc);
2752         if (PageCgroupUsed(pc)) {
2753                 memcg = pc->mem_cgroup;
2754                 if (memcg && !css_tryget(&memcg->css))
2755                         memcg = NULL;
2756         } else if (PageSwapCache(page)) {
2757                 ent.val = page_private(page);
2758                 id = lookup_swap_cgroup_id(ent);
2759                 rcu_read_lock();
2760                 memcg = mem_cgroup_lookup(id);
2761                 if (memcg && !css_tryget(&memcg->css))
2762                         memcg = NULL;
2763                 rcu_read_unlock();
2764         }
2765         unlock_page_cgroup(pc);
2766         return memcg;
2767 }
2768
/*
 * Bind @page to @memcg: record the owner in the page's page_cgroup, mark the
 * page_cgroup as used and update the group's statistics and events.
 *
 * @memcg:    group the page is charged to
 * @page:     page being committed; its page_cgroup must not be in use yet
 * @nr_pages: page count passed to the statistics update
 * @ctype:    only compared against MEM_CGROUP_CHARGE_TYPE_ANON here, to
 *            choose the "anon" flag passed to the statistics update
 * @lrucare:  if true, a page found on an LRU list is taken off it (under
 *            zone->lru_lock) before pc->mem_cgroup changes and is put back
 *            on the new owner's list afterwards
 */
static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
                                       struct page *page,
                                       unsigned int nr_pages,
                                       enum charge_type ctype,
                                       bool lrucare)
{
        struct page_cgroup *pc = lookup_page_cgroup(page);
        struct zone *uninitialized_var(zone);
        struct lruvec *lruvec;
        bool was_on_lru = false;
        bool anon;

        lock_page_cgroup(pc);
        VM_BUG_ON(PageCgroupUsed(pc));
        /*
         * we don't need page_cgroup_lock about tail pages, because they are not
         * accessed by any other context at this point.
         */

        /*
         * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page
         * may already be on some other mem_cgroup's LRU.  Take care of it.
         */
        if (lrucare) {
                zone = page_zone(page);
                spin_lock_irq(&zone->lru_lock);
                if (PageLRU(page)) {
                        /* still the old owner's lruvec: pc->mem_cgroup is unchanged */
                        lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
                        ClearPageLRU(page);
                        del_page_from_lru_list(page, lruvec, page_lru(page));
                        was_on_lru = true;
                }
        }

        pc->mem_cgroup = memcg;
        /*
         * We access a page_cgroup asynchronously without lock_page_cgroup().
         * Especially when a page_cgroup is taken from a page, pc->mem_cgroup
         * is accessed after testing USED bit. To make pc->mem_cgroup visible
         * before USED bit, we need memory barrier here.
         * See mem_cgroup_add_lru_list(), etc.
         */
        smp_wmb();
        SetPageCgroupUsed(pc);

        if (lrucare) {
                if (was_on_lru) {
                        /* now the new owner's lruvec */
                        lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
                        VM_BUG_ON(PageLRU(page));
                        SetPageLRU(page);
                        add_page_to_lru_list(page, lruvec, page_lru(page));
                }
                spin_unlock_irq(&zone->lru_lock);
        }

        if (ctype == MEM_CGROUP_CHARGE_TYPE_ANON)
                anon = true;
        else
                anon = false;

        mem_cgroup_charge_statistics(memcg, page, anon, nr_pages);
        unlock_page_cgroup(pc);

        /*
         * "charge_statistics" updated event counter.
         */
        memcg_check_events(memcg, page);
}
2837
/*
 * NOTE(review): the comments on memcg_update_cache_sizes() state that it
 * runs with this mutex held; other users are not visible in this part of
 * the file.
 */
static DEFINE_MUTEX(set_limit_mutex);
2839
2840 #ifdef CONFIG_MEMCG_KMEM
2841 static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg)
2842 {
2843         return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) &&
2844                 (memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK);
2845 }
2846
2847 /*
2848  * This is a bit cumbersome, but it is rarely used and avoids a backpointer
2849  * in the memcg_cache_params struct.
2850  */
2851 static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p)
2852 {
2853         struct kmem_cache *cachep;
2854
2855         VM_BUG_ON(p->is_root_cache);
2856         cachep = p->root_cache;
2857         return cachep->memcg_params->memcg_caches[memcg_cache_id(p->memcg)];
2858 }
2859
#ifdef CONFIG_SLABINFO
/*
 * seq_file read handler: print the slabinfo header followed by one
 * cache_show() record for each cache on the memcg's memcg_slab_caches list.
 * The list is walked with slab_caches_mutex held.
 *
 * Returns 0, or -EIO if kmem accounting is not possible for this memcg.
 */
static int mem_cgroup_slabinfo_read(struct cgroup_subsys_state *css,
                                    struct cftype *cft, struct seq_file *m)
{
        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
        struct memcg_cache_params *entry;

        if (!memcg_can_account_kmem(memcg))
                return -EIO;

        print_slabinfo_header(m);

        mutex_lock(&memcg->slab_caches_mutex);
        list_for_each_entry(entry, &memcg->memcg_slab_caches, list) {
                struct kmem_cache *cachep = memcg_params_to_cache(entry);

                cache_show(cachep, m);
        }
        mutex_unlock(&memcg->slab_caches_mutex);

        return 0;
}
#endif
2880
2881 static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
2882 {
2883         struct res_counter *fail_res;
2884         struct mem_cgroup *_memcg;
2885         int ret = 0;
2886         bool may_oom;
2887
2888         ret = res_counter_charge(&memcg->kmem, size, &fail_res);
2889         if (ret)
2890                 return ret;
2891
2892         /*
2893          * Conditions under which we can wait for the oom_killer. Those are
2894          * the same conditions tested by the core page allocator
2895          */
2896         may_oom = (gfp & __GFP_FS) && !(gfp & __GFP_NORETRY);
2897
2898         _memcg = memcg;
2899         ret = __mem_cgroup_try_charge(NULL, gfp, size >> PAGE_SHIFT,
2900                                       &_memcg, may_oom);
2901
2902         if (ret == -EINTR)  {
2903                 /*
2904                  * __mem_cgroup_try_charge() chosed to bypass to root due to
2905                  * OOM kill or fatal signal.  Since our only options are to
2906                  * either fail the allocation or charge it to this cgroup, do
2907                  * it as a temporary condition. But we can't fail. From a
2908                  * kmem/slab perspective, the cache has already been selected,
2909                  * by mem_cgroup_kmem_get_cache(), so it is too late to change
2910                  * our minds.
2911                  *
2912                  * This condition will only trigger if the task entered
2913                  * memcg_charge_kmem in a sane state, but was OOM-killed during
2914                  * __mem_cgroup_try_charge() above. Tasks that were already
2915                  * dying when the allocation triggers should have been already
2916                  * directed to the root cgroup in memcontrol.h
2917                  */
2918                 res_counter_charge_nofail(&memcg->res, size, &fail_res);
2919                 if (do_swap_account)
2920                         res_counter_charge_nofail(&memcg->memsw, size,
2921                                                   &fail_res);
2922                 ret = 0;
2923         } else if (ret)
2924                 res_counter_uncharge(&memcg->kmem, size);
2925
2926         return ret;
2927 }
2928
2929 static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size)
2930 {
2931         res_counter_uncharge(&memcg->res, size);
2932         if (do_swap_account)
2933                 res_counter_uncharge(&memcg->memsw, size);
2934
2935         /* Not down to 0 */
2936         if (res_counter_uncharge(&memcg->kmem, size))
2937                 return;
2938
2939         /*
2940          * Releases a reference taken in kmem_cgroup_css_offline in case
2941          * this last uncharge is racing with the offlining code or it is
2942          * outliving the memcg existence.
2943          *
2944          * The memory barrier imposed by test&clear is paired with the
2945          * explicit one in memcg_kmem_mark_dead().
2946          */
2947         if (memcg_kmem_test_and_clear_dead(memcg))
2948                 css_put(&memcg->css);
2949 }
2950
2951 void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep)
2952 {
2953         if (!memcg)
2954                 return;
2955
2956         mutex_lock(&memcg->slab_caches_mutex);
2957         list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches);
2958         mutex_unlock(&memcg->slab_caches_mutex);
2959 }
2960
2961 /*
2962  * helper for acessing a memcg's index. It will be used as an index in the
2963  * child cache array in kmem_cache, and also to derive its name. This function
2964  * will return -1 when this is not a kmem-limited memcg.
2965  */
2966 int memcg_cache_id(struct mem_cgroup *memcg)
2967 {
2968         return memcg ? memcg->kmemcg_id : -1;
2969 }
2970
2971 /*
2972  * This ends up being protected by the set_limit mutex, during normal
2973  * operation, because that is its main call site.
2974  *
2975  * But when we create a new cache, we can call this as well if its parent
2976  * is kmem-limited. That will have to hold set_limit_mutex as well.
2977  */
2978 int memcg_update_cache_sizes(struct mem_cgroup *memcg)
2979 {
2980         int num, ret;
2981
2982         num = ida_simple_get(&kmem_limited_groups,
2983                                 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
2984         if (num < 0)
2985                 return num;
2986         /*
2987          * After this point, kmem_accounted (that we test atomically in
2988          * the beginning of this conditional), is no longer 0. This
2989          * guarantees only one process will set the following boolean
2990          * to true. We don't need test_and_set because we're protected
2991          * by the set_limit_mutex anyway.
2992          */
2993         memcg_kmem_set_activated(memcg);
2994
2995         ret = memcg_update_all_caches(num+1);
2996         if (ret) {
2997                 ida_simple_remove(&kmem_limited_groups, num);
2998                 memcg_kmem_clear_activated(memcg);
2999                 return ret;
3000         }
3001
3002         memcg->kmemcg_id = num;
3003         INIT_LIST_HEAD(&memcg->memcg_slab_caches);
3004         mutex_init(&memcg->slab_caches_mutex);
3005         return 0;
3006 }
3007
3008 static size_t memcg_caches_array_size(int num_groups)
3009 {
3010         ssize_t size;
3011         if (num_groups <= 0)
3012                 return 0;
3013
3014         size = 2 * num_groups;
3015         if (size < MEMCG_CACHES_MIN_SIZE)
3016                 size = MEMCG_CACHES_MIN_SIZE;
3017         else if (size > MEMCG_CACHES_MAX_SIZE)
3018                 size = MEMCG_CACHES_MAX_SIZE;
3019
3020         return size;
3021 }
3022
3023 /*
3024  * We should update the current array size iff all caches updates succeed. This
3025  * can only be done from the slab side. The slab mutex needs to be held when
3026  * calling this.
3027  */
3028 void memcg_update_array_size(int num)
3029 {
3030         if (num > memcg_limited_groups_array_size)
3031                 memcg_limited_groups_array_size = memcg_caches_array_size(num);
3032 }
3033
/*
 * Forward declaration: memcg_register_cache() installs this as a work
 * handler before the function is defined further down.
 */
static void kmem_cache_destroy_work_func(struct work_struct *w);
3035
3036 int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
3037 {
3038         struct memcg_cache_params *cur_params = s->memcg_params;
3039
3040         VM_BUG_ON(s->memcg_params && !s->memcg_params->is_root_cache);
3041
3042         if (num_groups > memcg_limited_groups_array_size) {
3043                 int i;
3044                 ssize_t size = memcg_caches_array_size(num_groups);
3045
3046                 size *= sizeof(void *);
3047                 size += offsetof(struct memcg_cache_params, memcg_caches);
3048
3049                 s->memcg_params = kzalloc(size, GFP_KERNEL);
3050                 if (!s->memcg_params) {
3051                         s->memcg_params = cur_params;
3052                         return -ENOMEM;
3053                 }
3054
3055                 s->memcg_params->is_root_cache = true;
3056
3057                 /*
3058                  * There is the chance it will be bigger than
3059                  * memcg_limited_groups_array_size, if we failed an allocation
3060                  * in a cache, in which case all caches updated before it, will
3061                  * have a bigger array.
3062                  *
3063                  * But if that is the case, the data after
3064                  * memcg_limited_groups_array_size is certainly unused
3065                  */
3066                 for (i = 0; i < memcg_limited_groups_array_size; i++) {
3067                         if (!cur_params->memcg_caches[i])
3068                                 continue;
3069                         s->memcg_params->memcg_caches[i] =
3070                                                 cur_params->memcg_caches[i];
3071                 }
3072
3073                 /*
3074                  * Ideally, we would wait until all caches succeed, and only
3075                  * then free the old one. But this is not worth the extra
3076                  * pointer per-cache we'd have to have for this.
3077                  *
3078                  * It is not a big deal if some caches are left with a size
3079                  * bigger than the others. And all updates will reset this
3080                  * anyway.
3081                  */
3082                 kfree(cur_params);
3083         }
3084         return 0;
3085 }
3086
3087 int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
3088                          struct kmem_cache *root_cache)
3089 {
3090         size_t size;
3091
3092         if (!memcg_kmem_enabled())
3093                 return 0;
3094
3095         if (!memcg) {
3096                 size = offsetof(struct memcg_cache_params, memcg_caches);
3097                 size += memcg_limited_groups_array_size * sizeof(void *);
3098         } else
3099                 size = sizeof(struct memcg_cache_params);
3100
3101         s->memcg_params = kzalloc(size, GFP_KERNEL);
3102         if (!s->memcg_params)
3103                 return -ENOMEM;
3104
3105         if (memcg) {
3106                 s->memcg_params->memcg = memcg;
3107                 s->memcg_params->root_cache = root_cache;
3108                 INIT_WORK(&s->memcg_params->destroy,
3109                                 kmem_cache_destroy_work_func);
3110         } else
3111                 s->memcg_params->is_root_cache = true;
3112
3113         return 0;
3114 }
3115
3116 void memcg_release_cache(struct kmem_cache *s)
3117 {
3118         struct kmem_cache *root;
3119         struct mem_cgroup *memcg;
3120         int id;
3121
3122         /*
3123          * This happens, for instance, when a root cache goes away before we
3124          * add any memcg.
3125          */
3126         if (!s->memcg_params)
3127                 return;
3128
3129         if (s->memcg_params->is_root_cache)
3130                 goto out;
3131
3132         memcg = s->memcg_params->memcg;
3133         id  = memcg_cache_id(memcg);
3134
3135         root = s->memcg_params->root_cache;
3136         root->memcg_params->memcg_caches[id] = NULL;
3137
3138         mutex_lock(&memcg->slab_caches_mutex);
3139         list_del(&s->memcg_params->list);
3140         mutex_unlock(&memcg->slab_caches_mutex);
3141
3142         css_put(&memcg->css);
3143 out:
3144         kfree(s->memcg_params);
3145 }
3146
3147 /*
3148  * During the creation a new cache, we need to disable our accounting mechanism
3149  * altogether. This is true even if we are not creating, but rather just
3150  * enqueing new caches to be created.
3151  *
3152  * This is because that process will trigger allocations; some visible, like
3153  * explicit kmallocs to auxiliary data structures, name strings and internal
3154  * cache structures; some well concealed, like INIT_WORK() that can allocate
3155  * objects during debug.
3156  *
3157  * If any allocation happens during memcg_kmem_get_cache, we will recurse back
3158  * to it. This may not be a bounded recursion: since the first cache creation
3159  * failed to complete (waiting on the allocation), we'll just try to create the
3160  * cache again, failing at the same point.
3161  *
3162  * memcg_kmem_get_cache is prepared to abort after seeing a positive count of
3163  * memcg_kmem_skip_account. So we enclose anything that might allocate memory
3164  * inside the following two functions.
3165  */
3166 static inline void memcg_stop_kmem_account(void)
3167 {
3168         VM_BUG_ON(!current->mm);
3169         current->memcg_kmem_skip_account++;
3170 }
3171
3172 static inline void memcg_resume_kmem_account(void)
3173 {
3174         VM_BUG_ON(!current->mm);
3175         current->memcg_kmem_skip_account--;
3176 }
3177
3178 static void kmem_cache_destroy_work_func(struct work_struct *w)
3179 {
3180         struct kmem_cache *cachep;
3181         struct memcg_cache_params *p;
3182
3183         p = container_of(w, struct memcg_cache_params, destroy);
3184
3185         cachep = memcg_params_to_cache(p);
3186
3187         /*
3188          * If we get down to 0 after shrink, we could delete right away.
3189          * However, memcg_release_pages() already puts us back in the workqueue
3190          * in that case. If we proceed deleting, we'll get a dangling
3191          * reference, and removing the object from the workqueue in that case
3192          * is unnecessary complication. We are not a fast path.
3193          *
3194          * Note that this case is fundamentally different from racing with
3195          * shrink_slab(): if memcg_cgroup_destroy_cache() is called in
3196          * kmem_cache_shrink, not only we would be reinserting a dead cache
3197          * into the queue, but doing so from inside the worker racing to
3198          * destroy it.
3199          *
3200          * So if we aren't down to zero, we'll just schedule a worker and try
3201          * again
3202          */
3203         if (atomic_read(&cachep->memcg_params->nr_pages) != 0) {
3204                 kmem_cache_shrink(cachep);
3205                 if (atomic_read(&cachep->memcg_params->nr_pages) == 0)
3206                         return;
3207         } else
3208                 kmem_cache_destroy(cachep);
3209 }
3210
3211 void mem_cgroup_destroy_cache(struct kmem_cache *cachep)
3212 {
3213         if (!cachep->memcg_params->dead)
3214                 return;
3215
3216         /*
3217          * There are many ways in which we can get here.
3218          *
3219          * We can get to a memory-pressure situation while the delayed work is
3220          * still pending to run. The vmscan shrinkers can then release all
3221          * cache memory and get us to destruction. If this is the case, we'll
3222          * be executed twice, which is a bug (the second time will execute over
3223          * bogus data). In this case, cancelling the work should be fine.
3224          *
3225          * But we can also get here from the worker itself, if
3226          * kmem_cache_shrink is enough to shake all the remaining objects and
3227          * get the page count to 0. In this case, we'll deadlock if we try to
3228          * cancel the work (the worker runs with an internal lock held, which
3229          * is the same lock we would hold for cancel_work_sync().)
3230          *
3231          * Since we can't possibly know who got us here, just refrain from
3232          * running if there is already work pending
3233          */
3234         if (work_pending(&cachep->memcg_params->destroy))
3235                 return;
3236         /*
3237          * We have to defer the actual destroying to a workqueue, because
3238          * we might currently be in a context that cannot sleep.
3239          */
3240         schedule_work(&cachep->memcg_params->destroy);
3241 }
3242
/*
 * This lock protects updaters, not readers. We want readers to be as fast as
 * they can, and they will either see NULL or a valid cache value. Our model
 * allows them to see NULL, in which case the root memcg will be selected.
 *
 * We need this lock because multiple allocations to the same cache
 * (presumably from a non-root memcg; the original sentence was truncated)
 * can span more than one worker. Only one of them can create the cache.
 */
static DEFINE_MUTEX(memcg_cache_mutex);
3252
3253 /*
3254  * Called with memcg_cache_mutex held
3255  */
3256 static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg,
3257                                          struct kmem_cache *s)
3258 {
3259         struct kmem_cache *new;
3260         static char *tmp_name = NULL;
3261
3262         lockdep_assert_held(&memcg_cache_mutex);
3263
3264         /*
3265          * kmem_cache_create_memcg duplicates the given name and
3266          * cgroup_name for this name requires RCU context.
3267          * This static temporary buffer is used to prevent from
3268          * pointless shortliving allocation.
3269          */
3270         if (!tmp_name) {
3271                 tmp_name = kmalloc(PATH_MAX, GFP_KERNEL);
3272                 if (!tmp_name)
3273                         return NULL;
3274         }
3275
3276         rcu_read_lock();
3277         snprintf(tmp_name, PATH_MAX, "%s(%d:%s)", s->name,
3278                          memcg_cache_id(memcg), cgroup_name(memcg->css.cgroup));
3279         rcu_read_unlock();
3280
3281         new = kmem_cache_create_memcg(memcg, tmp_name, s->object_size, s->align,
3282                                       (s->flags & ~SLAB_PANIC), s->ctor, s);
3283
3284         if (new)
3285                 new->allocflags |= __GFP_KMEMCG;
3286
3287         return new;
3288 }
3289
3290 static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
3291                                                   struct kmem_cache *cachep)
3292 {
3293         struct kmem_cache *new_cachep;
3294         int idx;
3295
3296         BUG_ON(!memcg_can_account_kmem(memcg));
3297
3298         idx = memcg_cache_id(memcg);
3299
3300         mutex_lock(&memcg_cache_mutex);
3301         new_cachep = cachep->memcg_params->memcg_caches[idx];
3302         if (new_cachep) {
3303                 css_put(&memcg->css);
3304                 goto out;
3305         }
3306
3307         new_cachep = kmem_cache_dup(memcg, cachep);
3308         if (new_cachep == NULL) {
3309                 new_cachep = cachep;
3310                 css_put(&memcg->css);
3311                 goto out;
3312         }
3313
3314         atomic_set(&new_cachep->memcg_params->nr_pages , 0);
3315
3316         cachep->memcg_params->memcg_caches[idx] = new_cachep;
3317         /*
3318          * the readers won't lock, make sure everybody sees the updated value,
3319          * so they won't put stuff in the queue again for no reason
3320          */
3321         wmb();
3322 out:
3323         mutex_unlock(&memcg_cache_mutex);
3324         return new_cachep;
3325 }
3326
3327 void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
3328 {
3329         struct kmem_cache *c;
3330         int i;
3331
3332         if (!s->memcg_params)
3333                 return;
3334         if (!s->memcg_params->is_root_cache)
3335                 return;
3336
3337         /*
3338          * If the cache is being destroyed, we trust that there is no one else
3339          * requesting objects from it. Even if there are, the sanity checks in
3340          * kmem_cache_destroy should caught this ill-case.
3341          *
3342          * Still, we don't want anyone else freeing memcg_caches under our
3343          * noses, which can happen if a new memcg comes to life. As usual,
3344          * we'll take the set_limit_mutex to protect ourselves against this.
3345          */
3346         mutex_lock(&set_limit_mutex);
3347         for (i = 0; i < memcg_limited_groups_array_size; i++) {
3348                 c = s->memcg_params->memcg_caches[i];
3349                 if (!c)
3350                         continue;
3351
3352                 /*
3353                  * We will now manually delete the caches, so to avoid races
3354                  * we need to cancel all pending destruction workers and
3355                  * proceed with destruction ourselves.
3356                  *
3357                  * kmem_cache_destroy() will call kmem_cache_shrink internally,
3358                  * and that could spawn the workers again: it is likely that
3359                  * the cache still have active pages until this very moment.
3360                  * This would lead us back to mem_cgroup_destroy_cache.
3361                  *
3362                  * But that will not execute at all if the "dead" flag is not
3363                  * set, so flip it down to guarantee we are in control.
3364                  */
3365                 c->memcg_params->dead = false;
3366                 cancel_work_sync(&c->memcg_params->destroy);
3367                 kmem_cache_destroy(c);
3368         }
3369         mutex_unlock(&set_limit_mutex);
3370 }
3371
/*
 * Deferred request to create a per-memcg copy of a cache; one is allocated
 * per request, queued with schedule_work() and freed by the work function.
 */
3372 struct create_work {
3373         struct mem_cgroup *memcg;	/* memcg the copy is created for */
3374         struct kmem_cache *cachep;	/* cache to be duplicated */
3375         struct work_struct work;	/* runs memcg_create_cache_work_func() */
3376 };
3377
3378 static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
3379 {
3380         struct kmem_cache *cachep;
3381         struct memcg_cache_params *params;
3382
3383         if (!memcg_kmem_is_active(memcg))
3384                 return;
3385
3386         mutex_lock(&memcg->slab_caches_mutex);
3387         list_for_each_entry(params, &memcg->memcg_slab_caches, list) {
3388                 cachep = memcg_params_to_cache(params);
3389                 cachep->memcg_params->dead = true;
3390                 schedule_work(&cachep->memcg_params->destroy);
3391         }
3392         mutex_unlock(&memcg->slab_caches_mutex);
3393 }
3394
3395 static void memcg_create_cache_work_func(struct work_struct *w)
3396 {
3397         struct create_work *cw;
3398
3399         cw = container_of(w, struct create_work, work);
3400         memcg_create_kmem_cache(cw->memcg, cw->cachep);
3401         kfree(cw);
3402 }
3403
3404 /*
3405  * Enqueue the creation of a per-memcg kmem_cache.
3406  */
3407 static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg,
3408                                          struct kmem_cache *cachep)
3409 {
3410         struct create_work *cw;
3411
3412         cw = kmalloc(sizeof(struct create_work), GFP_NOWAIT);
3413         if (cw == NULL) {
3414                 css_put(&memcg->css);
3415                 return;
3416         }
3417
3418         cw->memcg = memcg;
3419         cw->cachep = cachep;
3420
3421         INIT_WORK(&cw->work, memcg_create_cache_work_func);
3422         schedule_work(&cw->work);
3423 }
3424
/*
 * Queue creation of @memcg's copy of @cachep with kmem accounting paused.
 */
static void memcg_create_cache_enqueue(struct mem_cgroup *memcg,
				       struct kmem_cache *cachep)
{
	/*
	 * Accounting must be off while we kmalloc: if the matching kmalloc
	 * cache has not been created yet, the first allocation made by
	 * __memcg_create_cache_enqueue would recurse into cache selection.
	 *
	 * The whole call is wrapped, not just the kmalloc, because depending
	 * on the debugging options enabled even INIT_WORK() can allocate,
	 * and we cannot afford to re-enter memcg_kmem_get_cache from here.
	 */
	memcg_stop_kmem_account();
	__memcg_create_cache_enqueue(memcg, cachep);
	memcg_resume_kmem_account();
}
3443 /*
3444  * Return the kmem_cache we're supposed to use for a slab allocation.
3445  * We try to use the current memcg's version of the cache.
3446  *
3447  * If the cache does not exist yet, if we are the first user of it,
3448  * we either create it immediately, if possible, or create it asynchronously
3449  * in a workqueue.
3450  * In the latter case, we will let the current allocation go through with
3451  * the original cache.
3452  *
3453  * Can't be called in interrupt context or from kernel threads.
3454  * This function needs to be called with rcu_read_lock() held.
3455  */
3456 struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
3457                                           gfp_t gfp)
3458 {
3459         struct mem_cgroup *memcg;
3460         int idx;
3461
3462         VM_BUG_ON(!cachep->memcg_params);
3463         VM_BUG_ON(!cachep->memcg_params->is_root_cache);
3464
3465         if (!current->mm || current->memcg_kmem_skip_account)
3466                 return cachep;
3467
3468         rcu_read_lock();
3469         memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner));
3470
3471         if (!memcg_can_account_kmem(memcg))
3472                 goto out;
3473
3474         idx = memcg_cache_id(memcg);
3475
3476         /*
3477          * barrier to mare sure we're always seeing the up to date value.  The
3478          * code updating memcg_caches will issue a write barrier to match this.
3479          */
3480         read_barrier_depends();
3481         if (likely(cachep->memcg_params->memcg_caches[idx])) {
3482                 cachep = cachep->memcg_params->memcg_caches[idx];
3483                 goto out;
3484         }
3485
3486         /* The corresponding put will be done in the workqueue. */
3487         if (!css_tryget(&memcg->css))
3488                 goto out;
3489         rcu_read_unlock();
3490
3491         /*
3492          * If we are in a safe context (can wait, and not in interrupt
3493          * context), we could be be predictable and return right away.
3494          * This would guarantee that the allocation being performed
3495          * already belongs in the new cache.
3496          *
3497          * However, there are some clashes that can arrive from locking.
3498          * For instance, because we acquire the slab_mutex while doing
3499          * kmem_cache_dup, this means no further allocation could happen
3500          * with the slab_mutex held.
3501          *
3502          * Also, because cache creation issue get_online_cpus(), this
3503          * creates a lock chain: memcg_slab_mutex -> cpu_hotplug_mutex,
3504          * that ends up reversed during cpu hotplug. (cpuset allocates
3505          * a bunch of GFP_KERNEL memory during cpuup). Due to all that,
3506          * better to defer everything.
3507          */
3508         memcg_create_cache_enqueue(memcg, cachep);
3509         return cachep;
3510 out:
3511         rcu_read_unlock();
3512         return cachep;
3513 }
3514 EXPORT_SYMBOL(__memcg_kmem_get_cache);
3515
3516 /*
3517  * We need to verify if the allocation against current->mm->owner's memcg is
3518  * possible for the given order. But the page is not allocated yet, so we'll
3519  * need a further commit step to do the final arrangements.
3520  *
3521  * It is possible for the task to switch cgroups in this mean time, so at
3522  * commit time, we can't rely on task conversion any longer.  We'll then use
3523  * the handle argument to return to the caller which cgroup we should commit
3524  * against. We could also return the memcg directly and avoid the pointer
3525  * passing, but a boolean return value gives better semantics considering
3526  * the compiled-out case as well.
3527  *
3528  * Returning true means the allocation is possible.
3529  */
3530 bool
3531 __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
3532 {
3533         struct mem_cgroup *memcg;
3534         int ret;
3535
3536         *_memcg = NULL;
3537
3538         /*
3539          * Disabling accounting is only relevant for some specific memcg
3540          * internal allocations. Therefore we would initially not have such
3541          * check here, since direct calls to the page allocator that are marked
3542          * with GFP_KMEMCG only happen outside memcg core. We are mostly
3543          * concerned with cache allocations, and by having this test at
3544          * memcg_kmem_get_cache, we are already able to relay the allocation to
3545          * the root cache and bypass the memcg cache altogether.
3546          *
3547          * There is one exception, though: the SLUB allocator does not create
3548          * large order caches, but rather service large kmallocs directly from
3549          * the page allocator. Therefore, the following sequence when backed by
3550          * the SLUB allocator:
3551          *
3552          *      memcg_stop_kmem_account();
3553          *      kmalloc(<large_number>)
3554          *      memcg_resume_kmem_account();
3555          *
3556          * would effectively ignore the fact that we should skip accounting,
3557          * since it will drive us directly to this function without passing
3558          * through the cache selector memcg_kmem_get_cache. Such large
3559          * allocations are extremely rare but can happen, for instance, for the
3560          * cache arrays. We bring this test here.
3561          */
3562         if (!current->mm || current->memcg_kmem_skip_account)
3563                 return true;
3564
3565         memcg = try_get_mem_cgroup_from_mm(current->mm);
3566
3567         /*
3568          * very rare case described in mem_cgroup_from_task. Unfortunately there
3569          * isn't much we can do without complicating this too much, and it would
3570          * be gfp-dependent anyway. Just let it go
3571          */
3572         if (unlikely(!memcg))
3573                 return true;
3574
3575         if (!memcg_can_account_kmem(memcg)) {
3576                 css_put(&memcg->css);
3577                 return true;
3578         }
3579
3580         ret = memcg_charge_kmem(memcg, gfp, PAGE_SIZE << order);
3581         if (!ret)
3582                 *_memcg = memcg;
3583
3584         css_put(&memcg->css);
3585         return (ret == 0);
3586 }
3587
3588 void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg,
3589                               int order)
3590 {
3591         struct page_cgroup *pc;
3592
3593         VM_BUG_ON(mem_cgroup_is_root(memcg));
3594
3595         /* The page allocation failed. Revert */
3596         if (!page) {
3597                 memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
3598                 return;
3599         }
3600
3601         pc = lookup_page_cgroup(page);
3602         lock_page_cgroup(pc);
3603         pc->mem_cgroup = memcg;
3604         SetPageCgroupUsed(pc);
3605         unlock_page_cgroup(pc);
3606 }
3607
3608 void __memcg_kmem_uncharge_pages(struct page *page, int order)
3609 {
3610         struct mem_cgroup *memcg = NULL;
3611         struct page_cgroup *pc;
3612
3613
3614         pc = lookup_page_cgroup(page);
3615         /*
3616          * Fast unlocked return. Theoretically might have changed, have to
3617          * check again after locking.
3618          */
3619         if (!PageCgroupUsed(pc))
3620                 return;
3621
3622         lock_page_cgroup(pc);
3623         if (PageCgroupUsed(pc)) {
3624                 memcg = pc->mem_cgroup;
3625                 ClearPageCgroupUsed(pc);
3626         }
3627         unlock_page_cgroup(pc);
3628
3629         /*
3630          * We trust that only if there is a memcg associated with the page, it
3631          * is a valid allocation
3632          */
3633         if (!memcg)
3634                 return;
3635
3636         VM_BUG_ON(mem_cgroup_is_root(memcg));
3637         memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
3638 }
3639 #else
/* No-op variant used when CONFIG_MEMCG_KMEM is not set. */
3640 static inline void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
3641 {
3642 }
3643 #endif /* CONFIG_MEMCG_KMEM */
3644
3645 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
3646
3647 #define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION)
3648 /*
3649  * Because tail pages are not marked as "used", set it. We're under
3650  * zone->lru_lock, 'splitting on pmd' and compound_lock.
3651  * charge/uncharge will be never happen and move_account() is done under
3652  * compound_lock(), so we don't have to take care of races.
3653  */
3654 void mem_cgroup_split_huge_fixup(struct page *head)
3655 {
3656         struct page_cgroup *head_pc = lookup_page_cgroup(head);
3657         struct page_cgroup *pc;
3658         struct mem_cgroup *memcg;
3659         int i;
3660
3661         if (mem_cgroup_disabled())
3662                 return;
3663
3664         memcg = head_pc->mem_cgroup;
3665         for (i = 1; i < HPAGE_PMD_NR; i++) {
3666                 pc = head_pc + i;
3667                 pc->mem_cgroup = memcg;
3668                 smp_wmb();/* see __commit_charge() */
3669                 pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
3670         }
3671         __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
3672                        HPAGE_PMD_NR);
3673 }
3674 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
3675
3676 static inline
3677 void mem_cgroup_move_account_page_stat(struct mem_cgroup *from,
3678                                         struct mem_cgroup *to,
3679                                         unsigned int nr_pages,
3680                                         enum mem_cgroup_stat_index idx)
3681 {
3682         /* Update stat data for mem_cgroup */
3683         preempt_disable();
3684         WARN_ON_ONCE(from->stat->count[idx] < nr_pages);
3685         __this_cpu_add(from->stat->count[idx], -nr_pages);
3686         __this_cpu_add(to->stat->count[idx], nr_pages);
3687         preempt_enable();
3688 }
3689
3690 /**
3691  * mem_cgroup_move_account - move account of the page
3692  * @page: the page
3693  * @nr_pages: number of regular pages (>1 for huge pages)
3694  * @pc: page_cgroup of the page.
3695  * @from: mem_cgroup which the page is moved from.
3696  * @to: mem_cgroup which the page is moved to. @from != @to.
3697  *
3698  * The caller must confirm following.
3699  * - page is not on LRU (isolate_page() is useful.)
3700  * - compound_lock is held when nr_pages > 1
3701  *
3702  * This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
3703  * from old cgroup.
3704  */
3705 static int mem_cgroup_move_account(struct page *page,
3706                                    unsigned int nr_pages,
3707                                    struct page_cgroup *pc,
3708                                    struct mem_cgroup *from,
3709                                    struct mem_cgroup *to)
3710 {
3711         unsigned long flags;
3712         int ret;
3713         bool anon = PageAnon(page);
3714
3715         VM_BUG_ON(from == to);
3716         VM_BUG_ON(PageLRU(page));
3717         /*
3718          * The page is isolated from LRU. So, collapse function
3719          * will not handle this page. But page splitting can happen.
3720          * Do this check under compound_page_lock(). The caller should
3721          * hold it.
3722          */
3723         ret = -EBUSY;
3724         if (nr_pages > 1 && !PageTransHuge(page))
3725                 goto out;
3726
3727         lock_page_cgroup(pc);
3728
3729         ret = -EINVAL;
3730         if (!PageCgroupUsed(pc) || pc->mem_cgroup != from)
3731                 goto unlock;
3732
3733         move_lock_mem_cgroup(from, &flags);
3734
3735         if (!anon && page_mapped(page))
3736                 mem_cgroup_move_account_page_stat(from, to, nr_pages,
3737                         MEM_CGROUP_STAT_FILE_MAPPED);
3738
3739         if (PageWriteback(page))
3740                 mem_cgroup_move_account_page_stat(from, to, nr_pages,
3741                         MEM_CGROUP_STAT_WRITEBACK);
3742
3743         mem_cgroup_charge_statistics(from, page, anon, -nr_pages);
3744
3745         /* caller should have done css_get */
3746         pc->mem_cgroup = to;
3747         mem_cgroup_charge_statistics(to, page, anon, nr_pages);
3748         move_unlock_mem_cgroup(from, &flags);
3749         ret = 0;
3750 unlock:
3751         unlock_page_cgroup(pc);
3752         /*
3753          * check events
3754          */
3755         memcg_check_events(to, page);
3756         memcg_check_events(from, page);
3757 out:
3758         return ret;
3759 }
3760
3761 /**
3762  * mem_cgroup_move_parent - moves page to the parent group
3763  * @page: the page to move
3764  * @pc: page_cgroup of the page
3765  * @child: page's cgroup
3766  *
3767  * move charges to its parent or the root cgroup if the group has no
3768  * parent (aka use_hierarchy==0).
3769  * Although this might fail (get_page_unless_zero, isolate_lru_page or
3770  * mem_cgroup_move_account fails) the failure is always temporary and
3771  * it signals a race with a page removal/uncharge or migration. In the
3772  * first case the page is on the way out and it will vanish from the LRU
3773  * on the next attempt and the call should be retried later.
3774  * Isolation from the LRU fails only if page has been isolated from
3775  * the LRU since we looked at it and that usually means either global
3776  * reclaim or migration going on. The page will either get back to the
3777  * LRU or vanish.
3778  * Finaly mem_cgroup_move_account fails only if the page got uncharged
3779  * (!PageCgroupUsed) or moved to a different group. The page will
3780  * disappear in the next attempt.
3781  */
3782 static int mem_cgroup_move_parent(struct page *page,
3783                                   struct page_cgroup *pc,
3784                                   struct mem_cgroup *child)
3785 {
3786         struct mem_cgroup *parent;
3787         unsigned int nr_pages;
3788         unsigned long uninitialized_var(flags);
3789         int ret;
3790
3791         VM_BUG_ON(mem_cgroup_is_root(child));
3792
3793         ret = -EBUSY;
3794         if (!get_page_unless_zero(page))
3795                 goto out;
3796         if (isolate_lru_page(page))
3797                 goto put;
3798
3799         nr_pages = hpage_nr_pages(page);
3800
3801         parent = parent_mem_cgroup(child);
3802         /*
3803          * If no parent, move charges to root cgroup.
3804          */
3805         if (!parent)
3806                 parent = root_mem_cgroup;
3807
3808         if (nr_pages > 1) {
3809                 VM_BUG_ON(!PageTransHuge(page));
3810                 flags = compound_lock_irqsave(page);
3811         }
3812
3813         ret = mem_cgroup_move_account(page, nr_pages,
3814                                 pc, child, parent);
3815         if (!ret)
3816                 __mem_cgroup_cancel_local_charge(child, nr_pages);
3817
3818         if (nr_pages > 1)
3819                 compound_unlock_irqrestore(page, flags);
3820         putback_lru_page(page);
3821 put:
3822         put_page(page);
3823 out:
3824         return ret;
3825 }
3826
3827 /*
3828  * Charge the memory controller for page usage.
3829  * Return
3830  * 0 if the charge was successful
3831  * < 0 if the cgroup is over its limit
3832  */
3833 static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
3834                                 gfp_t gfp_mask, enum charge_type ctype)
3835 {
3836         struct mem_cgroup *memcg = NULL;
3837         unsigned int nr_pages = 1;
3838         bool oom = true;
3839         int ret;
3840
3841         if (PageTransHuge(page)) {
3842                 nr_pages <<= compound_order(page);
3843                 VM_BUG_ON(!PageTransHuge(page));
3844                 /*
3845                  * Never OOM-kill a process for a huge page.  The
3846                  * fault handler will fall back to regular pages.
3847                  */
3848                 oom = false;
3849         }
3850
3851         ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom);
3852         if (ret == -ENOMEM)
3853                 return ret;
3854         __mem_cgroup_commit_charge(memcg, page, nr_pages, ctype, false);
3855         return 0;
3856 }
3857
3858 int mem_cgroup_newpage_charge(struct page *page,
3859                               struct mm_struct *mm, gfp_t gfp_mask)
3860 {
3861         if (mem_cgroup_disabled())
3862                 return 0;
3863         VM_BUG_ON(page_mapped(page));
3864         VM_BUG_ON(page->mapping && !PageAnon(page));
3865         VM_BUG_ON(!mm);
3866         return mem_cgroup_charge_common(page, mm, gfp_mask,
3867                                         MEM_CGROUP_CHARGE_TYPE_ANON);
3868 }
3869
3870 /*
3871  * While swap-in, try_charge -> commit or cancel, the page is locked.
3872  * And when try_charge() successfully returns, one refcnt to memcg without
3873  * struct page_cgroup is acquired. This refcnt will be consumed by
3874  * "commit()" or removed by "cancel()"
3875  */
3876 static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm,
3877                                           struct page *page,
3878                                           gfp_t mask,
3879                                           struct mem_cgroup **memcgp)
3880 {
3881         struct mem_cgroup *memcg;
3882         struct page_cgroup *pc;
3883         int ret;
3884
3885         pc = lookup_page_cgroup(page);
3886         /*
3887          * Every swap fault against a single page tries to charge the
3888          * page, bail as early as possible.  shmem_unuse() encounters
3889          * already charged pages, too.  The USED bit is protected by
3890          * the page lock, which serializes swap cache removal, which
3891          * in turn serializes uncharging.
3892          */
3893         if (PageCgroupUsed(pc))
3894                 return 0;
3895         if (!do_swap_account)
3896                 goto charge_cur_mm;
3897         memcg = try_get_mem_cgroup_from_page(page);
3898         if (!memcg)
3899                 goto charge_cur_mm;
3900         *memcgp = memcg;
3901         ret = __mem_cgroup_try_charge(NULL, mask, 1, memcgp, true);
3902         css_put(&memcg->css);
3903         if (ret == -EINTR)
3904                 ret = 0;
3905         return ret;
3906 charge_cur_mm:
3907         ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true);
3908         if (ret == -EINTR)
3909                 ret = 0;
3910         return ret;
3911 }
3912
3913 int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page,
3914                                  gfp_t gfp_mask, struct mem_cgroup **memcgp)
3915 {
3916         *memcgp = NULL;
3917         if (mem_cgroup_disabled())
3918                 return 0;
3919         /*
3920          * A racing thread's fault, or swapoff, may have already
3921          * updated the pte, and even removed page from swap cache: in
3922          * those cases unuse_pte()'s pte_same() test will fail; but
3923          * there's also a KSM case which does need to charge the page.
3924          */
3925         if (!PageSwapCache(page)) {
3926                 int ret;
3927
3928                 ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, memcgp, true);
3929                 if (ret == -EINTR)
3930                         ret = 0;
3931                 return ret;
3932         }
3933         return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp);
3934 }
3935
/*
 * Undo a single-page swap-in try-charge against @memcg. Does nothing when
 * the memory controller is disabled or @memcg is NULL.
 */
void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg)
{
	if (!mem_cgroup_disabled() && memcg)
		__mem_cgroup_cancel_charge(memcg, 1);
}
3944
3945 static void
3946 __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
3947                                         enum charge_type ctype)
3948 {
3949         if (mem_cgroup_disabled())
3950                 return;
3951         if (!memcg)
3952                 return;
3953
3954         __mem_cgroup_commit_charge(memcg, page, 1, ctype, true);
3955         /*
3956          * Now swap is on-memory. This means this page may be
3957          * counted both as mem and swap....double count.
3958          * Fix it by uncharging from memsw. Basically, this SwapCache is stable
3959          * under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page()
3960          * may call delete_from_swap_cache() before reach here.
3961          */
3962         if (do_swap_account && PageSwapCache(page)) {
3963                 swp_entry_t ent = {.val = page_private(page)};
3964                 mem_cgroup_uncharge_swap(ent);
3965         }
3966 }
3967
3968 void mem_cgroup_commit_charge_swapin(struct page *page,
3969                                      struct mem_cgroup *memcg)
3970 {
3971         __mem_cgroup_commit_charge_swapin(page, memcg,
3972                                           MEM_CGROUP_CHARGE_TYPE_ANON);
3973 }
3974
3975 int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
3976                                 gfp_t gfp_mask)
3977 {
3978         struct mem_cgroup *memcg = NULL;
3979         enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
3980         int ret;
3981
3982         if (mem_cgroup_disabled())
3983                 return 0;
3984         if (PageCompound(page))
3985                 return 0;
3986
3987         if (!PageSwapCache(page))
3988                 ret = mem_cgroup_charge_common(page, mm, gfp_mask, type);
3989         else { /* page is swapcache/shmem */
3990                 ret = __mem_cgroup_try_charge_swapin(mm, page,
3991                                                      gfp_mask, &memcg);
3992                 if (!ret)
3993                         __mem_cgroup_commit_charge_swapin(page, memcg, type);
3994         }
3995         return ret;
3996 }
3997
3998 static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg,
3999                                    unsigned int nr_pages,
4000                                    const enum charge_type ctype)
4001 {
4002         struct memcg_batch_info *batch = NULL;
4003         bool uncharge_memsw = true;
4004
4005         /* If swapout, usage of swap doesn't decrease */
4006         if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
4007                 uncharge_memsw = false;
4008
4009         batch = &current->memcg_batch;
4010         /*
4011          * In usual, we do css_get() when we remember memcg pointer.
4012          * But in this case, we keep res->usage until end of a series of
4013          * uncharges. Then, it's ok to ignore memcg's refcnt.
4014          */
4015         if (!batch->memcg)
4016                 batch->memcg = memcg;
4017         /*
4018          * do_batch > 0 when unmapping pages or inode invalidate/truncate.
4019          * In those cases, all pages freed continuously can be expected to be in
4020          * the same cgroup and we have chance to coalesce uncharges.
4021          * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE)
4022          * because we want to do uncharge as soon as possible.
4023          */
4024
4025         if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
4026                 goto direct_uncharge;
4027
4028         if (nr_pages > 1)
4029                 goto direct_uncharge;
4030
4031         /*
4032          * In typical case, batch->memcg == mem. This means we can
4033          * merge a series of uncharges to an uncharge of res_counter.
4034          * If not, we uncharge res_counter ony by one.
4035          */
4036         if (batch->memcg != memcg)
4037                 goto direct_uncharge;
4038         /* remember freed charge and uncharge it later */
4039         batch->nr_pages++;
4040         if (uncharge_memsw)
4041                 batch->memsw_nr_pages++;
4042         return;
4043 direct_uncharge:
4044         res_counter_uncharge(&memcg->res, nr_pages * PAGE_SIZE);
4045         if (uncharge_memsw)
4046                 res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE);
4047         if (unlikely(batch->memcg != memcg))
4048                 memcg_oom_recover(memcg);
4049 }
4050
/*
 * uncharge if !page_mapped(page)
 *
 * Drop the memcg charge of @page unless the page is still considered in
 * use for the given charge type.
 *
 * @page:          page whose charge should be dropped
 * @ctype:         kind of charge being dropped; selects which "still in
 *                 use" checks are applied (see the switch below)
 * @end_migration: when true, a page that still has the migration bit set
 *                 may be uncharged, and the res_counter is left untouched
 *
 * Returns the memcg the page was charged to when the page_cgroup was
 * actually marked unused, or NULL when nothing was done (memcg disabled,
 * page_cgroup not in use, or one of the ctype checks bailed out).
 * For MEM_CGROUP_CHARGE_TYPE_SWAPOUT with do_swap_account set, a css
 * reference on the returned memcg is taken before returning.
 */
static struct mem_cgroup *
__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,
			     bool end_migration)
{
	struct mem_cgroup *memcg = NULL;
	unsigned int nr_pages = 1;
	struct page_cgroup *pc;
	bool anon;

	if (mem_cgroup_disabled())
		return NULL;

	/* A transparent huge page is uncharged as 2^order base pages. */
	if (PageTransHuge(page)) {
		nr_pages <<= compound_order(page);
		VM_BUG_ON(!PageTransHuge(page));
	}
	/*
	 * Check if our page_cgroup is valid
	 */
	pc = lookup_page_cgroup(page);
	/* Unlocked fast-path test; repeated under the lock below. */
	if (unlikely(!PageCgroupUsed(pc)))
		return NULL;

	lock_page_cgroup(pc);

	memcg = pc->mem_cgroup;

	/* Re-check now that the page_cgroup lock is held. */
	if (!PageCgroupUsed(pc))
		goto unlock_out;

	anon = PageAnon(page);

	switch (ctype) {
	case MEM_CGROUP_CHARGE_TYPE_ANON:
		/*
		 * Generally PageAnon tells if it's the anon statistics to be
		 * updated; but sometimes e.g. mem_cgroup_uncharge_page() is
		 * used before page reached the stage of being marked PageAnon.
		 */
		anon = true;
		/* fallthrough */
	case MEM_CGROUP_CHARGE_TYPE_DROP:
		/* See mem_cgroup_prepare_migration() */
		if (page_mapped(page))
			goto unlock_out;
		/*
		 * Pages under migration may not be uncharged.  But
		 * end_migration() /must/ be the one uncharging the
		 * unused post-migration page and so it has to call
		 * here with the migration bit still set.  See the
		 * res_counter handling below.
		 */
		if (!end_migration && PageCgroupMigration(pc))
			goto unlock_out;
		break;
	case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
		if (!PageAnon(page)) {	/* Shared memory */
			if (page->mapping && !page_is_file_cache(page))
				goto unlock_out;
		} else if (page_mapped(page)) /* Anon */
				goto unlock_out;
		break;
	default:
		break;
	}

	/* Past this point the uncharge is committed. */
	mem_cgroup_charge_statistics(memcg, page, anon, -nr_pages);

	ClearPageCgroupUsed(pc);
	/*
	 * pc->mem_cgroup is not cleared here. It will be accessed when it's
	 * freed from LRU. This is safe because uncharged page is expected not
	 * to be reused (freed soon). Exception is SwapCache, it's handled by
	 * special functions.
	 */

	unlock_page_cgroup(pc);
	/*
	 * even after unlock, we have memcg->res.usage here and this memcg
	 * will never be freed, so it's safe to call css_get().
	 */
	memcg_check_events(memcg, page);
	if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
		mem_cgroup_swap_statistics(memcg, true);
		css_get(&memcg->css);
	}
	/*
	 * Migration does not charge the res_counter for the
	 * replacement page, so leave it alone when phasing out the
	 * page that is unused after the migration.
	 */
	if (!end_migration && !mem_cgroup_is_root(memcg))
		mem_cgroup_do_uncharge(memcg, nr_pages, ctype);

	return memcg;

unlock_out:
	unlock_page_cgroup(pc);
	return NULL;
}
4154
4155 void mem_cgroup_uncharge_page(struct page *page)
4156 {
4157         /* early check. */
4158         if (page_mapped(page))
4159                 return;
4160         VM_BUG_ON(page->mapping && !PageAnon(page));
4161         /*
4162          * If the page is in swap cache, uncharge should be deferred
4163          * to the swap path, which also properly accounts swap usage
4164          * and handles memcg lifetime.
4165          *
4166          * Note that this check is not stable and reclaim may add the
4167          * page to swap cache at any time after this.  However, if the
4168          * page is not in swap cache by the time page->mapcount hits
4169          * 0, there won't be any page table references to the swap
4170          * slot, and reclaim will free it and not actually write the
4171          * page to disk.
4172          */
4173         if (PageSwapCache(page))
4174                 return;
4175         __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON, false);
4176 }
4177
4178 void mem_cgroup_uncharge_cache_page(struct page *page)
4179 {
4180         VM_BUG_ON(page_mapped(page));
4181         VM_BUG_ON(page->mapping);
4182         __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false);
4183 }
4184
/*
 * Batch_start/batch_end is called in unmap_page_range/invalidate/truncate.
 * In those cases, pages are freed continuously and we can expect the pages
 * to be in the same memcg. All these callers themselves limit the number of
 * pages freed at once, so uncharge_start/end() is called properly.
 * This may be called multiple (2 or more) times in one context, i.e. nested.
 */
4192
4193 void mem_cgroup_uncharge_start(void)
4194 {
4195         current->memcg_batch.do_batch++;
4196         /* We can do nest. */
4197         if (current->memcg_batch.do_batch == 1) {
4198                 current->memcg_batch.memcg = NULL;
4199                 current->memcg_batch.nr_pages = 0;
4200                 current->memcg_batch.memsw_nr_pages = 0;
4201         }
4202 }
4203
4204 void mem_cgroup_uncharge_end(void)
4205 {
4206         struct memcg_batch_info *batch = &current->memcg_batch;
4207
4208         if (!batch->do_batch)
4209                 return;
4210
4211         batch->do_batch--;
4212         if (batch->do_batch) /* If stacked, do nothing. */
4213                 return;
4214
4215         if (!batch->memcg)
4216                 return;
4217         /*
4218          * This "batch->memcg" is valid without any css_get/put etc...
4219          * bacause we hide charges behind us.
4220          */
4221         if (batch->nr_pages)
4222                 res_counter_uncharge(&batch->memcg->res,
4223                                      batch->nr_pages * PAGE_SIZE);
4224         if (batch->memsw_nr_pages)
4225                 res_counter_uncharge(&batch->memcg->memsw,
4226                                      batch->memsw_nr_pages * PAGE_SIZE);
4227         memcg_oom_recover(batch->memcg);
4228         /* forget this pointer (for sanity check) */
4229         batch->memcg = NULL;
4230 }
4231
4232 #ifdef CONFIG_SWAP
4233 /*
4234  * called after __delete_from_swap_cache() and drop "page" account.
4235  * memcg information is recorded to swap_cgroup of "ent"
4236  */
4237 void
4238 mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
4239 {
4240         struct mem_cgroup *memcg;
4241         int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT;
4242
4243         if (!swapout) /* this was a swap cache but the swap is unused ! */
4244                 ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
4245
4246         memcg = __mem_cgroup_uncharge_common(page, ctype, false);
4247
4248         /*
4249          * record memcg information,  if swapout && memcg != NULL,
4250          * css_get() was called in uncharge().
4251          */
4252         if (do_swap_account && swapout && memcg)
4253                 swap_cgroup_record(ent, mem_cgroup_id(memcg));
4254 }
4255 #endif
4256
4257 #ifdef CONFIG_MEMCG_SWAP
4258 /*
4259  * called from swap_entry_free(). remove record in swap_cgroup and
4260  * uncharge "memsw" account.
4261  */
4262 void mem_cgroup_uncharge_swap(swp_entry_t ent)
4263 {
4264         struct mem_cgroup *memcg;
4265         unsigned short id;
4266
4267         if (!do_swap_account)
4268                 return;
4269
4270         id = swap_cgroup_record(ent, 0);
4271         rcu_read_lock();
4272         memcg = mem_cgroup_lookup(id);
4273         if (memcg) {
4274                 /*
4275                  * We uncharge this because swap is freed.
4276                  * This memcg can be obsolete one. We avoid calling css_tryget
4277                  */
4278                 if (!mem_cgroup_is_root(memcg))
4279                         res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
4280                 mem_cgroup_swap_statistics(memcg, false);
4281                 css_put(&memcg->css);
4282         }
4283         rcu_read_unlock();
4284 }
4285
4286 /**
4287  * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
4288  * @entry: swap entry to be moved
4289  * @from:  mem_cgroup which the entry is moved from
4290  * @to:  mem_cgroup which the entry is moved to
4291  *
4292  * It succeeds only when the swap_cgroup's record for this entry is the same
4293  * as the mem_cgroup's id of @from.
4294  *
4295  * Returns 0 on success, -EINVAL on failure.
4296  *
4297  * The caller must have charged to @to, IOW, called res_counter_charge() about
4298  * both res and memsw, and called css_get().
4299  */
4300 static int mem_cgroup_move_swap_account(swp_entry_t entry,
4301                                 struct mem_cgroup *from, struct mem_cgroup *to)
4302 {
4303         unsigned short old_id, new_id;
4304
4305         old_id = mem_cgroup_id(from);
4306         new_id = mem_cgroup_id(to);
4307
4308         if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
4309                 mem_cgroup_swap_statistics(from, false);
4310                 mem_cgroup_swap_statistics(to, true);
4311                 /*
4312                  * This function is only called from task migration context now.
4313                  * It postpones res_counter and refcount handling till the end
4314                  * of task migration(mem_cgroup_clear_mc()) for performance
4315                  * improvement. But we cannot postpone css_get(to)  because if
4316                  * the process that has been moved to @to does swap-in, the
4317                  * refcount of @to might be decreased to 0.
4318                  *
4319                  * We are in attach() phase, so the cgroup is guaranteed to be
4320                  * alive, so we can just call css_get().
4321                  */
4322                 css_get(&to->css);
4323                 return 0;
4324         }
4325         return -EINVAL;
4326 }
4327 #else
/*
 * Stub used when CONFIG_MEMCG_SWAP is not set: there is no swap
 * accounting record to move, so the move always fails with -EINVAL.
 */
static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
				struct mem_cgroup *from, struct mem_cgroup *to)
{
	return -EINVAL;
}
4333 #endif
4334
/*
 * Before starting migration, commit @newpage to the mem_cgroup that the
 * old page belongs to.
 *
 * @page:    page about to be migrated
 * @newpage: replacement page
 * @memcgp:  output; set to the memcg @page is charged to, or to NULL when
 *           memcg is disabled or @page is not charged
 *
 * When a memcg is reported through @memcgp, a css reference on it has
 * been taken here; mem_cgroup_end_migration() drops it.
 */
void mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
				  struct mem_cgroup **memcgp)
{
	struct mem_cgroup *memcg = NULL;
	unsigned int nr_pages = 1;
	struct page_cgroup *pc;
	enum charge_type ctype;

	*memcgp = NULL;

	if (mem_cgroup_disabled())
		return;

	/* A transparent huge page counts as 2^order base pages. */
	if (PageTransHuge(page))
		nr_pages <<= compound_order(page);

	pc = lookup_page_cgroup(page);
	lock_page_cgroup(pc);
	if (PageCgroupUsed(pc)) {
		memcg = pc->mem_cgroup;
		css_get(&memcg->css);
		/*
		 * At migrating an anonymous page, its mapcount goes down
		 * to 0 and uncharge() will be called. But, even if it's fully
		 * unmapped, migration may fail and this page has to be
		 * charged again. We set MIGRATION flag here and delay uncharge
		 * until end_migration() is called
		 *
		 * Corner Case Thinking
		 * A)
		 * When the old page was mapped as Anon and it's unmap-and-freed
		 * while migration was ongoing.
		 * If unmap finds the old page, uncharge() of it will be delayed
		 * until end_migration(). If unmap finds a new page, it's
		 * uncharged when it make mapcount to be 1->0. If unmap code
		 * finds swap_migration_entry, the new page will not be mapped
		 * and end_migration() will find it(mapcount==0).
		 *
		 * B)
		 * When the old page was mapped but migration fails, the kernel
		 * remaps it. A charge for it is kept by MIGRATION flag even
		 * if mapcount goes down to 0. We can do remap successfully
		 * without charging it again.
		 *
		 * C)
		 * The "old" page is under lock_page() until the end of
		 * migration, so, the old page itself will not be swapped-out.
		 * If the new page is swapped out before end_migration, our
		 * hook to usual swap-out path will catch the event.
		 */
		if (PageAnon(page))
			SetPageCgroupMigration(pc);
	}
	unlock_page_cgroup(pc);
	/*
	 * If the page is not charged at this point,
	 * we return here.
	 */
	if (!memcg)
		return;

	*memcgp = memcg;
	/*
	 * We charge new page before it's used/mapped. So, even if unlock_page()
	 * is called before end_migration, we can catch all events on this new
	 * page. In the case new page is migrated but not remapped, new page's
	 * mapcount will be finally 0 and we call uncharge in end_migration().
	 */
	if (PageAnon(page))
		ctype = MEM_CGROUP_CHARGE_TYPE_ANON;
	else
		ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
	/*
	 * The page is committed to the memcg, but it's not actually
	 * charged to the res_counter since we plan on replacing the
	 * old one and only one page is going to be left afterwards.
	 */
	__mem_cgroup_commit_charge(memcg, newpage, nr_pages, ctype, false);
}
4418
/*
 * Finish the memcg side of a page migration: remove the redundant charge
 * of whichever page ends up unused.
 *
 * @memcg:        memcg reported by mem_cgroup_prepare_migration(); NULL
 *                means there is nothing to do
 * @oldpage:      the page that was being migrated
 * @newpage:      the replacement page
 * @migration_ok: true if the migration succeeded (so @newpage is the page
 *                in use), false if it failed (so @oldpage stays in use)
 *
 * Also drops a css reference on @memcg.
 */
void mem_cgroup_end_migration(struct mem_cgroup *memcg,
	struct page *oldpage, struct page *newpage, bool migration_ok)
{
	struct page *used, *unused;
	struct page_cgroup *pc;
	bool anon;

	if (!memcg)
		return;

	if (!migration_ok) {
		used = oldpage;
		unused = newpage;
	} else {
		used = newpage;
		unused = oldpage;
	}
	anon = PageAnon(used);
	/* end_migration == true: see __mem_cgroup_uncharge_common(). */
	__mem_cgroup_uncharge_common(unused,
				     anon ? MEM_CGROUP_CHARGE_TYPE_ANON
				     : MEM_CGROUP_CHARGE_TYPE_CACHE,
				     true);
	css_put(&memcg->css);
	/*
	 * We disallowed uncharge of pages under migration because mapcount
	 * of the page goes down to zero, temporarily.
	 * Clear the flag and check the page should be charged.
	 */
	pc = lookup_page_cgroup(oldpage);
	lock_page_cgroup(pc);
	ClearPageCgroupMigration(pc);
	unlock_page_cgroup(pc);

	/*
	 * If a page is a file cache, radix-tree replacement is very atomic
	 * and we can skip this check. When it was an Anon page, its mapcount
	 * goes down to 0. But because we added MIGRATION flag, it's not
	 * uncharged yet. There are several case but page->mapcount check
	 * and USED bit check in mem_cgroup_uncharge_page() will do enough
	 * check. (see prepare_charge() also)
	 */
	if (anon)
		mem_cgroup_uncharge_page(used);
}
4464
4465 /*
4466  * At replace page cache, newpage is not under any memcg but it's on
4467  * LRU. So, this function doesn't touch res_counter but handles LRU
4468  * in correct way. Both pages are locked so we cannot race with uncharge.
4469  */
4470 void mem_cgroup_replace_page_cache(struct page *oldpage,
4471                                   struct page *newpage)
4472 {
4473         struct mem_cgroup *memcg = NULL;
4474         struct page_cgroup *pc;
4475         enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
4476
4477         if (mem_cgroup_disabled())
4478                 return;
4479
4480         pc = lookup_page_cgroup(oldpage);
4481         /* fix accounting on old pages */
4482         lock_page_cgroup(pc);
4483         if (PageCgroupUsed(pc)) {
4484                 memcg = pc->mem_cgroup;
4485                 mem_cgroup_charge_statistics(memcg, oldpage, false, -1);
4486                 ClearPageCgroupUsed(pc);
4487         }
4488         unlock_page_cgroup(pc);
4489
4490         /*
4491          * When called from shmem_replace_page(), in some cases the
4492          * oldpage has already been charged, and in some cases not.
4493          */
4494         if (!memcg)
4495                 return;
4496         /*
4497          * Even if newpage->mapping was NULL before starting replacement,
4498          * the newpage may be on LRU(or pagevec for LRU) already. We lock
4499          * LRU while we overwrite pc->mem_cgroup.
4500          */
4501         __mem_cgroup_commit_charge(memcg, newpage, 1, type, true);
4502 }
4503
4504 #ifdef CONFIG_DEBUG_VM
4505 static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
4506 {
4507         struct page_cgroup *pc;
4508
4509         pc = lookup_page_cgroup(page);
4510         /*
4511          * Can be NULL while feeding pages into the page allocator for
4512          * the first time, i.e. during boot or memory hotplug;
4513          * or when mem_cgroup_disabled().
4514          */
4515         if (likely(pc) && PageCgroupUsed(pc))
4516                 return pc;
4517         return NULL;
4518 }
4519
4520 bool mem_cgroup_bad_page_check(struct page *page)
4521 {
4522         if (mem_cgroup_disabled())
4523                 return false;
4524
4525         return lookup_page_cgroup_used(page) != NULL;
4526 }
4527
4528 void mem_cgroup_print_bad_page(struct page *page)
4529 {
4530         struct page_cgroup *pc;
4531
4532         pc = lookup_page_cgroup_used(page);
4533         if (pc) {
4534                 pr_alert("pc:%p pc->flags:%lx pc->mem_cgroup:%p\n",
4535                          pc, pc->flags, pc->mem_cgroup);
4536         }
4537 }
4538 #endif
4539
4540 static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
4541                                 unsigned long long val)
4542 {
4543         int retry_count;
4544         u64 memswlimit, memlimit;
4545         int ret = 0;
4546         int children = mem_cgroup_count_children(memcg);
4547         u64 curusage, oldusage;
4548         int enlarge;
4549
4550         /*
4551          * For keeping hierarchical_reclaim simple, how long we should retry
4552          * is depends on callers. We set our retry-count to be function
4553          * of # of children which we should visit in this loop.
4554          */
4555         retry_count = MEM_CGROUP_RECLAIM_RETRIES * children;
4556
4557         oldusage = res_counter_read_u64(&memcg->res, RES_USAGE);
4558
4559         enlarge = 0;
4560         while (retry_count) {
4561                 if (signal_pending(current)) {
4562                         ret = -EINTR;
4563                         break;
4564                 }
4565                 /*
4566                  * Rather than hide all in some function, I do this in
4567                  * open coded manner. You see what this really does.
4568                  * We have to guarantee memcg->res.limit <= memcg->memsw.limit.
4569                  */
4570                 mutex_lock(&set_limit_mutex);
4571                 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
4572                 if (memswlimit < val) {
4573                         ret = -EINVAL;
4574                         mutex_unlock(&set_limit_mutex);
4575                         break;
4576                 }
4577
4578                 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
4579                 if (memlimit < val)
4580                         enlarge = 1;
4581
4582                 ret = res_counter_set_limit(&memcg->res, val);
4583                 if (!ret) {
4584                         if (memswlimit == val)
4585                                 memcg->memsw_is_minimum = true;
4586                         else
4587                                 memcg->memsw_is_minimum = false;
4588                 }
4589                 mutex_unlock(&set_limit_mutex);
4590
4591                 if (!ret)
4592                         break;
4593
4594                 mem_cgroup_reclaim(memcg, GFP_KERNEL,
4595                                    MEM_CGROUP_RECLAIM_SHRINK);
4596                 curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
4597                 /* Usage is reduced ? */
4598                 if (curusage >= oldusage)
4599                         retry_count--;
4600                 else
4601                         oldusage = curusage;
4602         }
4603         if (!ret && enlarge)
4604                 memcg_oom_recover(memcg);
4605
4606         return ret;
4607 }
4608
4609 static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
4610                                         unsigned long long val)
4611 {
4612         int retry_count;
4613         u64 memlimit, memswlimit, oldusage, curusage;
4614         int children = mem_cgroup_count_children(memcg);
4615         int ret = -EBUSY;
4616         int enlarge = 0;
4617
4618         /* see mem_cgroup_resize_res_limit */
4619         retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
4620         oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
4621         while (retry_count) {
4622                 if (signal_pending(current)) {
4623                         ret = -EINTR;
4624                         break;
4625                 }
4626                 /*
4627                  * Rather than hide all in some function, I do this in
4628                  * open coded manner. You see what this really does.
4629                  * We have to guarantee memcg->res.limit <= memcg->memsw.limit.
4630                  */
4631                 mutex_lock(&set_limit_mutex);
4632                 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
4633                 if (memlimit > val) {
4634                         ret = -EINVAL;
4635                         mutex_unlock(&set_limit_mutex);
4636                         break;
4637                 }
4638                 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
4639                 if (memswlimit < val)
4640                         enlarge = 1;
4641                 ret = res_counter_set_limit(&memcg->memsw, val);
4642                 if (!ret) {
4643                         if (memlimit == val)
4644                                 memcg->memsw_is_minimum = true;
4645                         else
4646                                 memcg->memsw_is_minimum = false;
4647                 }
4648                 mutex_unlock(&set_limit_mutex);
4649
4650                 if (!ret)
4651                         break;
4652
4653                 mem_cgroup_reclaim(memcg, GFP_KERNEL,
4654                                    MEM_CGROUP_RECLAIM_NOSWAP |
4655                                    MEM_CGROUP_RECLAIM_SHRINK);
4656                 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
4657                 /* Usage is reduced ? */
4658                 if (curusage >= oldusage)
4659                         retry_count--;
4660                 else
4661                         oldusage = curusage;
4662         }
4663         if (!ret && enlarge)
4664                 memcg_oom_recover(memcg);
4665         return ret;
4666 }
4667
/**
 * mem_cgroup_force_empty_list - clears LRU of a group
 * @memcg: group to clear
 * @node: NUMA node
 * @zid: zone id
 * @lru: lru to clear
 *
 * Traverse a specified page_cgroup list and try to drop them all.  This doesn't
 * reclaim the pages themselves - pages are moved to the parent (or root)
 * group.
 */
static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
				int node, int zid, enum lru_list lru)
{
	struct lruvec *lruvec;
	unsigned long flags;
	struct list_head *list;
	struct page *busy;
	struct zone *zone;

	zone = &NODE_DATA(node)->node_zones[zid];
	lruvec = mem_cgroup_zone_lruvec(zone, memcg);
	list = &lruvec->lists[lru];

	/* Page whose move failed in the previous iteration, if any. */
	busy = NULL;
	do {
		struct page_cgroup *pc;
		struct page *page;

		/* The LRU lock is only held while picking the tail page. */
		spin_lock_irqsave(&zone->lru_lock, flags);
		if (list_empty(list)) {
			spin_unlock_irqrestore(&zone->lru_lock, flags);
			break;
		}
		page = list_entry(list->prev, struct page, lru);
		if (busy == page) {
			/*
			 * Same page as last time: rotate it to the head of
			 * the list so a different tail page is tried next.
			 */
			list_move(&page->lru, list);
			busy = NULL;
			spin_unlock_irqrestore(&zone->lru_lock, flags);
			continue;
		}
		spin_unlock_irqrestore(&zone->lru_lock, flags);

		pc = lookup_page_cgroup(page);

		if (mem_cgroup_move_parent(page, pc, memcg)) {
			/* found lock contention or "pc" is obsolete. */
			busy = page;
			cond_resched();
		} else
			busy = NULL;
	} while (!list_empty(list));
}
4721
/*
 * make mem_cgroup's charge to be 0 if there is no task by moving
 * all the charges and pages to the parent.
 * This enables deleting this mem_cgroup.
 *
 * Every LRU list of every zone on every node with memory is emptied via
 * mem_cgroup_force_empty_list(); the whole pass is repeated until the
 * res usage not attributed to kmem reaches zero.
 *
 * Caller is responsible for holding css reference on the memcg.
 */
static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
{
	int node, zid;
	u64 usage;

	do {
		/* This is for making all *used* pages to be on LRU. */
		lru_add_drain_all();
		drain_all_stock_sync(memcg);
		mem_cgroup_start_move(memcg);
		for_each_node_state(node, N_MEMORY) {
			for (zid = 0; zid < MAX_NR_ZONES; zid++) {
				enum lru_list lru;
				for_each_lru(lru) {
					mem_cgroup_force_empty_list(memcg,
							node, zid, lru);
				}
			}
		}
		mem_cgroup_end_move(memcg);
		memcg_oom_recover(memcg);
		cond_resched();

		/*
		 * Kernel memory may not necessarily be trackable to a specific
		 * process. So they are not migrated, and therefore we can't
		 * expect their value to drop to 0 here.
		 * Having res filled up with kmem only is enough.
		 *
		 * This is a safety check because mem_cgroup_force_empty_list
		 * could have raced with mem_cgroup_replace_page_cache callers
		 * so the lru seemed empty but the page could have been added
		 * right after the check. RES_USAGE should be safe as we always
		 * charge before adding to the LRU.
		 */
		usage = res_counter_read_u64(&memcg->res, RES_USAGE) -
			res_counter_read_u64(&memcg->kmem, RES_USAGE);
	} while (usage > 0);
}
4768
4769 /*
4770  * This mainly exists for tests during the setting of set of use_hierarchy.
4771  * Since this is the very setting we are changing, the current hierarchy value
4772  * is meaningless
4773  */
4774 static inline bool __memcg_has_children(struct mem_cgroup *memcg)
4775 {
4776         struct cgroup_subsys_state *pos;
4777
4778         /* bounce at first found */
4779         css_for_each_child(pos, &memcg->css)
4780                 return true;
4781         return false;
4782 }
4783
4784 /*
4785  * Must be called with memcg_create_mutex held, unless the cgroup is guaranteed
4786  * to be already dead (as in mem_cgroup_force_empty, for instance).  This is
4787  * from mem_cgroup_count_children(), in the sense that we don't really care how
4788  * many children we have; we only need to know if we have any.  It also counts
4789  * any memcg without hierarchy as infertile.
4790  */
4791 static inline bool memcg_has_children(struct mem_cgroup *memcg)
4792 {
4793         return memcg->use_hierarchy && __memcg_has_children(memcg);
4794 }
4795
4796 /*
4797  * Reclaims as many pages from the given memcg as possible and moves
4798  * the rest to the parent.
4799  *
4800  * Caller is responsible for holding css reference for memcg.
4801  */
4802 static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
4803 {
4804         int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
4805         struct cgroup *cgrp = memcg->css.cgroup;
4806
4807         /* returns EBUSY if there is a task or if we come here twice. */
4808         if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
4809                 return -EBUSY;
4810
4811         /* we call try-to-free pages for make this cgroup empty */
4812         lru_add_drain_all();
4813         /* try to free all pages in this cgroup */
4814         while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) {
4815                 int progress;
4816
4817                 if (signal_pending(current))
4818                         return -EINTR;
4819
4820                 progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL,
4821                                                 false);
4822                 if (!progress) {
4823                         nr_retries--;
4824                         /* maybe some writeback is necessary */
4825                         congestion_wait(BLK_RW_ASYNC, HZ/10);
4826                 }
4827
4828         }
4829         lru_add_drain();
4830         mem_cgroup_reparent_charges(memcg);
4831
4832         return 0;
4833 }
4834
4835 static int mem_cgroup_force_empty_write(struct cgroup_subsys_state *css,
4836                                         unsigned int event)
4837 {
4838         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4839
4840         if (mem_cgroup_is_root(memcg))
4841                 return -EINVAL;
4842         return mem_cgroup_force_empty(memcg);
4843 }
4844
4845 static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
4846                                      struct cftype *cft)
4847 {
4848         return mem_cgroup_from_css(css)->use_hierarchy;
4849 }
4850
4851 static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
4852                                       struct cftype *cft, u64 val)
4853 {
4854         int retval = 0;
4855         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4856         struct mem_cgroup *parent_memcg = mem_cgroup_from_css(css_parent(&memcg->css));
4857
4858         mutex_lock(&memcg_create_mutex);
4859
4860         if (memcg->use_hierarchy == val)
4861                 goto out;
4862
4863         /*
4864          * If parent's use_hierarchy is set, we can't make any modifications
4865          * in the child subtrees. If it is unset, then the change can
4866          * occur, provided the current cgroup has no children.
4867          *
4868          * For the root cgroup, parent_mem is NULL, we allow value to be
4869          * set if there are no children.
4870          */
4871         if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
4872                                 (val == 1 || val == 0)) {
4873                 if (!__memcg_has_children(memcg))
4874                         memcg->use_hierarchy = val;
4875                 else
4876                         retval = -EBUSY;
4877         } else
4878                 retval = -EINVAL;
4879
4880 out:
4881         mutex_unlock(&memcg_create_mutex);
4882
4883         return retval;
4884 }
4885
4886
4887 static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg,
4888                                                enum mem_cgroup_stat_index idx)
4889 {
4890         struct mem_cgroup *iter;
4891         long val = 0;
4892
4893         /* Per-cpu values can be negative, use a signed accumulator */
4894         for_each_mem_cgroup_tree(iter, memcg)
4895                 val += mem_cgroup_read_stat(iter, idx);
4896
4897         if (val < 0) /* race ? */
4898                 val = 0;
4899         return val;
4900 }
4901
4902 static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
4903 {
4904         u64 val;
4905
4906         if (!mem_cgroup_is_root(memcg)) {
4907                 if (!swap)
4908                         return res_counter_read_u64(&memcg->res, RES_USAGE);
4909                 else
4910                         return res_counter_read_u64(&memcg->memsw, RES_USAGE);
4911         }
4912
4913         /*
4914          * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS
4915          * as well as in MEM_CGROUP_STAT_RSS_HUGE.
4916          */
4917         val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
4918         val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
4919
4920         if (swap)
4921                 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP);
4922
4923         return val << PAGE_SHIFT;
4924 }
4925
4926 static ssize_t mem_cgroup_read(struct cgroup_subsys_state *css,
4927                                struct cftype *cft, struct file *file,
4928                                char __user *buf, size_t nbytes, loff_t *ppos)
4929 {
4930         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4931         char str[64];
4932         u64 val;
4933         int name, len;
4934         enum res_type type;
4935
4936         type = MEMFILE_TYPE(cft->private);
4937         name = MEMFILE_ATTR(cft->private);
4938
4939         switch (type) {
4940         case _MEM:
4941                 if (name == RES_USAGE)
4942                         val = mem_cgroup_usage(memcg, false);
4943                 else
4944                         val = res_counter_read_u64(&memcg->res, name);
4945                 break;
4946         case _MEMSWAP:
4947                 if (name == RES_USAGE)
4948                         val = mem_cgroup_usage(memcg, true);
4949                 else
4950                         val = res_counter_read_u64(&memcg->memsw, name);
4951                 break;
4952         case _KMEM:
4953                 val = res_counter_read_u64(&memcg->kmem, name);
4954                 break;
4955         default:
4956                 BUG();
4957         }
4958
4959         len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val);
4960         return simple_read_from_buffer(buf, nbytes, ppos, str, len);
4961 }
4962
4963 static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val)
4964 {
4965         int ret = -EINVAL;
4966 #ifdef CONFIG_MEMCG_KMEM
4967         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4968         /*
4969          * For simplicity, we won't allow this to be disabled.  It also can't
4970          * be changed if the cgroup has children already, or if tasks had
4971          * already joined.
4972          *
4973          * If tasks join before we set the limit, a person looking at
4974          * kmem.usage_in_bytes will have no way to determine when it took
4975          * place, which makes the value quite meaningless.
4976          *
4977          * After it first became limited, changes in the value of the limit are
4978          * of course permitted.
4979          */
4980         mutex_lock(&memcg_create_mutex);
4981         mutex_lock(&set_limit_mutex);
4982         if (!memcg->kmem_account_flags && val != RES_COUNTER_MAX) {
4983                 if (cgroup_task_count(css->cgroup) || memcg_has_children(memcg)) {
4984                         ret = -EBUSY;
4985                         goto out;
4986                 }
4987                 ret = res_counter_set_limit(&memcg->kmem, val);
4988                 VM_BUG_ON(ret);
4989
4990                 ret = memcg_update_cache_sizes(memcg);
4991                 if (ret) {
4992                         res_counter_set_limit(&memcg->kmem, RES_COUNTER_MAX);
4993                         goto out;
4994                 }
4995                 static_key_slow_inc(&memcg_kmem_enabled_key);
4996                 /*
4997                  * setting the active bit after the inc will guarantee no one
4998                  * starts accounting before all call sites are patched
4999                  */
5000                 memcg_kmem_set_active(memcg);
5001         } else
5002                 ret = res_counter_set_limit(&memcg->kmem, val);
5003 out:
5004         mutex_unlock(&set_limit_mutex);
5005         mutex_unlock(&memcg_create_mutex);
5006 #endif
5007         return ret;
5008 }
5009
5010 #ifdef CONFIG_MEMCG_KMEM
5011 static int memcg_propagate_kmem(struct mem_cgroup *memcg)
5012 {
5013         int ret = 0;
5014         struct mem_cgroup *parent = parent_mem_cgroup(memcg);
5015         if (!parent)
5016                 goto out;
5017
5018         memcg->kmem_account_flags = parent->kmem_account_flags;
5019         /*
5020          * When that happen, we need to disable the static branch only on those
5021          * memcgs that enabled it. To achieve this, we would be forced to
5022          * complicate the code by keeping track of which memcgs were the ones
5023          * that actually enabled limits, and which ones got it from its
5024          * parents.
5025          *
5026          * It is a lot simpler just to do static_key_slow_inc() on every child
5027          * that is accounted.
5028          */
5029         if (!memcg_kmem_is_active(memcg))
5030                 goto out;
5031
5032         /*
5033          * __mem_cgroup_free() will issue static_key_slow_dec() because this
5034          * memcg is active already. If the later initialization fails then the
5035          * cgroup core triggers the cleanup so we do not have to do it here.
5036          */
5037         static_key_slow_inc(&memcg_kmem_enabled_key);
5038
5039         mutex_lock(&set_limit_mutex);
5040         memcg_stop_kmem_account();
5041         ret = memcg_update_cache_sizes(memcg);
5042         memcg_resume_kmem_account();
5043         mutex_unlock(&set_limit_mutex);
5044 out:
5045         return ret;
5046 }
5047 #endif /* CONFIG_MEMCG_KMEM */
5048
5049 /*
5050  * The user of this function is...
5051  * RES_LIMIT.
5052  */
5053 static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft,
5054                             const char *buffer)
5055 {
5056         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5057         enum res_type type;
5058         int name;
5059         unsigned long long val;
5060         int ret;
5061
5062         type = MEMFILE_TYPE(cft->private);
5063         name = MEMFILE_ATTR(cft->private);
5064
5065         switch (name) {
5066         case RES_LIMIT:
5067                 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
5068                         ret = -EINVAL;
5069                         break;
5070                 }
5071                 /* This function does all necessary parse...reuse it */
5072                 ret = res_counter_memparse_write_strategy(buffer, &val);
5073                 if (ret)
5074                         break;
5075                 if (type == _MEM)
5076                         ret = mem_cgroup_resize_limit(memcg, val);
5077                 else if (type == _MEMSWAP)
5078                         ret = mem_cgroup_resize_memsw_limit(memcg, val);
5079                 else if (type == _KMEM)
5080                         ret = memcg_update_kmem_limit(css, val);
5081                 else
5082                         return -EINVAL;
5083                 break;
5084         case RES_SOFT_LIMIT:
5085                 ret = res_counter_memparse_write_strategy(buffer, &val);
5086                 if (ret)
5087                         break;
5088                 /*
5089                  * For memsw, soft limits are hard to implement in terms
5090                  * of semantics, for now, we support soft limits for
5091                  * control without swap
5092                  */
5093                 if (type == _MEM)
5094                         ret = res_counter_set_soft_limit(&memcg->res, val);
5095                 else
5096                         ret = -EINVAL;
5097                 break;
5098         default:
5099                 ret = -EINVAL; /* should be BUG() ? */
5100                 break;
5101         }
5102         return ret;
5103 }
5104
5105 static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
5106                 unsigned long long *mem_limit, unsigned long long *memsw_limit)
5107 {
5108         unsigned long long min_limit, min_memsw_limit, tmp;
5109
5110         min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
5111         min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
5112         if (!memcg->use_hierarchy)
5113                 goto out;
5114
5115         while (css_parent(&memcg->css)) {
5116                 memcg = mem_cgroup_from_css(css_parent(&memcg->css));
5117                 if (!memcg->use_hierarchy)
5118                         break;
5119                 tmp = res_counter_read_u64(&memcg->res, RES_LIMIT);
5120                 min_limit = min(min_limit, tmp);
5121                 tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
5122                 min_memsw_limit = min(min_memsw_limit, tmp);
5123         }
5124 out:
5125         *mem_limit = min_limit;
5126         *memsw_limit = min_memsw_limit;
5127 }
5128
5129 static int mem_cgroup_reset(struct cgroup_subsys_state *css, unsigned int event)
5130 {
5131         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5132         int name;
5133         enum res_type type;
5134
5135         type = MEMFILE_TYPE(event);
5136         name = MEMFILE_ATTR(event);
5137
5138         switch (name) {
5139         case RES_MAX_USAGE:
5140                 if (type == _MEM)
5141                         res_counter_reset_max(&memcg->res);
5142                 else if (type == _MEMSWAP)
5143                         res_counter_reset_max(&memcg->memsw);
5144                 else if (type == _KMEM)
5145                         res_counter_reset_max(&memcg->kmem);
5146                 else
5147                         return -EINVAL;
5148                 break;
5149         case RES_FAILCNT:
5150                 if (type == _MEM)
5151                         res_counter_reset_failcnt(&memcg->res);
5152                 else if (type == _MEMSWAP)
5153                         res_counter_reset_failcnt(&memcg->memsw);
5154                 else if (type == _KMEM)
5155                         res_counter_reset_failcnt(&memcg->kmem);
5156                 else
5157                         return -EINVAL;
5158                 break;
5159         }
5160
5161         return 0;
5162 }
5163
5164 static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
5165                                         struct cftype *cft)
5166 {
5167         return mem_cgroup_from_css(css)->move_charge_at_immigrate;
5168 }
5169
5170 #ifdef CONFIG_MMU
5171 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
5172                                         struct cftype *cft, u64 val)
5173 {
5174         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5175
5176         if (val >= (1 << NR_MOVE_TYPE))
5177                 return -EINVAL;
5178
5179         /*
5180          * No kind of locking is needed in here, because ->can_attach() will
5181          * check this value once in the beginning of the process, and then carry
5182          * on with stale data. This means that changes to this value will only
5183          * affect task migrations starting after the change.
5184          */
5185         memcg->move_charge_at_immigrate = val;
5186         return 0;
5187 }
5188 #else
5189 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
5190                                         struct cftype *cft, u64 val)
5191 {
5192         return -ENOSYS;
5193 }
5194 #endif
5195
5196 #ifdef CONFIG_NUMA
5197 static int memcg_numa_stat_show(struct cgroup_subsys_state *css,
5198                                 struct cftype *cft, struct seq_file *m)
5199 {
5200         int nid;
5201         unsigned long total_nr, file_nr, anon_nr, unevictable_nr;
5202         unsigned long node_nr;
5203         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5204
5205         total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL);
5206         seq_printf(m, "total=%lu", total_nr);
5207         for_each_node_state(nid, N_MEMORY) {
5208                 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL);
5209                 seq_printf(m, " N%d=%lu", nid, node_nr);
5210         }
5211         seq_putc(m, '\n');
5212
5213         file_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE);
5214         seq_printf(m, "file=%lu", file_nr);
5215         for_each_node_state(nid, N_MEMORY) {
5216                 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
5217                                 LRU_ALL_FILE);
5218                 seq_printf(m, " N%d=%lu", nid, node_nr);
5219         }
5220         seq_putc(m, '\n');
5221
5222         anon_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON);
5223         seq_printf(m, "anon=%lu", anon_nr);
5224         for_each_node_state(nid, N_MEMORY) {
5225                 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
5226                                 LRU_ALL_ANON);
5227                 seq_printf(m, " N%d=%lu", nid, node_nr);
5228         }
5229         seq_putc(m, '\n');
5230
5231         unevictable_nr = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE));
5232         seq_printf(m, "unevictable=%lu", unevictable_nr);
5233         for_each_node_state(nid, N_MEMORY) {
5234                 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
5235                                 BIT(LRU_UNEVICTABLE));
5236                 seq_printf(m, " N%d=%lu", nid, node_nr);
5237         }
5238         seq_putc(m, '\n');
5239         return 0;
5240 }
5241 #endif /* CONFIG_NUMA */
5242
5243 static inline void mem_cgroup_lru_names_not_uptodate(void)
5244 {
5245         BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
5246 }
5247
5248 static int memcg_stat_show(struct cgroup_subsys_state *css, struct cftype *cft,
5249                                  struct seq_file *m)
5250 {
5251         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5252         struct mem_cgroup *mi;
5253         unsigned int i;
5254
5255         for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
5256                 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
5257                         continue;
5258                 seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i],
5259                            mem_cgroup_read_stat(memcg, i) * PAGE_SIZE);
5260         }
5261
5262         for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++)
5263                 seq_printf(m, "%s %lu\n", mem_cgroup_events_names[i],
5264                            mem_cgroup_read_events(memcg, i));
5265
5266         for (i = 0; i < NR_LRU_LISTS; i++)
5267                 seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i],
5268                            mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE);
5269
5270         /* Hierarchical information */
5271         {
5272                 unsigned long long limit, memsw_limit;
5273                 memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit);
5274                 seq_printf(m, "hierarchical_memory_limit %llu\n", limit);
5275                 if (do_swap_account)
5276                         seq_printf(m, "hierarchical_memsw_limit %llu\n",
5277                                    memsw_limit);
5278         }
5279
5280         for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
5281                 long long val = 0;
5282
5283                 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
5284                         continue;
5285                 for_each_mem_cgroup_tree(mi, memcg)
5286                         val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE;
5287                 seq_printf(m, "total_%s %lld\n", mem_cgroup_stat_names[i], val);
5288         }
5289
5290         for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
5291                 unsigned long long val = 0;
5292
5293                 for_each_mem_cgroup_tree(mi, memcg)
5294                         val += mem_cgroup_read_events(mi, i);
5295                 seq_printf(m, "total_%s %llu\n",
5296                            mem_cgroup_events_names[i], val);
5297         }
5298
5299         for (i = 0; i < NR_LRU_LISTS; i++) {
5300                 unsigned long long val = 0;
5301
5302                 for_each_mem_cgroup_tree(mi, memcg)
5303                         val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE;
5304                 seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val);
5305         }
5306
5307 #ifdef CONFIG_DEBUG_VM
5308         {
5309                 int nid, zid;
5310                 struct mem_cgroup_per_zone *mz;
5311                 struct zone_reclaim_stat *rstat;
5312                 unsigned long recent_rotated[2] = {0, 0};
5313                 unsigned long recent_scanned[2] = {0, 0};
5314
5315                 for_each_online_node(nid)
5316                         for (zid = 0; zid < MAX_NR_ZONES; zid++) {
5317                                 mz = mem_cgroup_zoneinfo(memcg, nid, zid);
5318                                 rstat = &mz->lruvec.reclaim_stat;
5319
5320                                 recent_rotated[0] += rstat->recent_rotated[0];
5321                                 recent_rotated[1] += rstat->recent_rotated[1];
5322                                 recent_scanned[0] += rstat->recent_scanned[0];
5323                                 recent_scanned[1] += rstat->recent_scanned[1];
5324                         }
5325                 seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]);
5326                 seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]);
5327                 seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]);
5328                 seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]);
5329         }
5330 #endif
5331
5332         return 0;
5333 }
5334
5335 static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css,
5336                                       struct cftype *cft)
5337 {
5338         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5339
5340         return mem_cgroup_swappiness(memcg);
5341 }
5342
5343 static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
5344                                        struct cftype *cft, u64 val)
5345 {
5346         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5347         struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css));
5348
5349         if (val > 100 || !parent)
5350                 return -EINVAL;
5351
5352         mutex_lock(&memcg_create_mutex);
5353
5354         /* If under hierarchy, only empty-root can set this value */
5355         if ((parent->use_hierarchy) || memcg_has_children(memcg)) {
5356                 mutex_unlock(&memcg_create_mutex);
5357                 return -EINVAL;
5358         }
5359
5360         memcg->swappiness = val;
5361
5362         mutex_unlock(&memcg_create_mutex);
5363
5364         return 0;
5365 }
5366
5367 static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
5368 {
5369         struct mem_cgroup_threshold_ary *t;
5370         u64 usage;
5371         int i;
5372
5373         rcu_read_lock();
5374         if (!swap)
5375                 t = rcu_dereference(memcg->thresholds.primary);
5376         else
5377                 t = rcu_dereference(memcg->memsw_thresholds.primary);
5378
5379         if (!t)
5380                 goto unlock;
5381
5382         usage = mem_cgroup_usage(memcg, swap);
5383
5384         /*
5385          * current_threshold points to threshold just below or equal to usage.
5386          * If it's not true, a threshold was crossed after last
5387          * call of __mem_cgroup_threshold().
5388          */
5389         i = t->current_threshold;
5390
5391         /*
5392          * Iterate backward over array of thresholds starting from
5393          * current_threshold and check if a threshold is crossed.
5394          * If none of thresholds below usage is crossed, we read
5395          * only one element of the array here.
5396          */
5397         for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
5398                 eventfd_signal(t->entries[i].eventfd, 1);
5399
5400         /* i = current_threshold + 1 */
5401         i++;
5402
5403         /*
5404          * Iterate forward over array of thresholds starting from
5405          * current_threshold+1 and check if a threshold is crossed.
5406          * If none of thresholds above usage is crossed, we read
5407          * only one element of the array here.
5408          */
5409         for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
5410                 eventfd_signal(t->entries[i].eventfd, 1);
5411
5412         /* Update current_threshold */
5413         t->current_threshold = i - 1;
5414 unlock:
5415         rcu_read_unlock();
5416 }
5417
5418 static void mem_cgroup_threshold(struct mem_cgroup *memcg)
5419 {
5420         while (memcg) {
5421                 __mem_cgroup_threshold(memcg, false);
5422                 if (do_swap_account)
5423                         __mem_cgroup_threshold(memcg, true);
5424
5425                 memcg = parent_mem_cgroup(memcg);
5426         }
5427 }
5428
5429 static int compare_thresholds(const void *a, const void *b)
5430 {
5431         const struct mem_cgroup_threshold *_a = a;
5432         const struct mem_cgroup_threshold *_b = b;
5433
5434         if (_a->threshold > _b->threshold)
5435                 return 1;
5436
5437         if (_a->threshold < _b->threshold)
5438                 return -1;
5439
5440         return 0;
5441 }
5442
5443 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
5444 {
5445         struct mem_cgroup_eventfd_list *ev;
5446
5447         list_for_each_entry(ev, &memcg->oom_notify, list)
5448                 eventfd_signal(ev->eventfd, 1);
5449         return 0;
5450 }
5451
5452 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
5453 {
5454         struct mem_cgroup *iter;
5455
5456         for_each_mem_cgroup_tree(iter, memcg)
5457                 mem_cgroup_oom_notify_cb(iter);
5458 }
5459
5460 static int mem_cgroup_usage_register_event(struct cgroup_subsys_state *css,
5461         struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
5462 {
5463         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5464         struct mem_cgroup_thresholds *thresholds;
5465         struct mem_cgroup_threshold_ary *new;
5466         enum res_type type = MEMFILE_TYPE(cft->private);
5467         u64 threshold, usage;
5468         int i, size, ret;
5469
5470         ret = res_counter_memparse_write_strategy(args, &threshold);
5471         if (ret)
5472                 return ret;
5473
5474         mutex_lock(&memcg->thresholds_lock);
5475
5476         if (type == _MEM)
5477                 thresholds = &memcg->thresholds;
5478         else if (type == _MEMSWAP)
5479                 thresholds = &memcg->memsw_thresholds;
5480         else
5481                 BUG();
5482
5483         usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
5484
5485         /* Check if a threshold crossed before adding a new one */
5486         if (thresholds->primary)
5487                 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
5488
5489         size = thresholds->primary ? thresholds->primary->size + 1 : 1;
5490
5491         /* Allocate memory for new array of thresholds */
5492         new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),
5493                         GFP_KERNEL);
5494         if (!new) {
5495                 ret = -ENOMEM;
5496                 goto unlock;
5497         }
5498         new->size = size;
5499
5500         /* Copy thresholds (if any) to new array */
5501         if (thresholds->primary) {
5502                 memcpy(new->entries, thresholds->primary->entries, (size - 1) *
5503                                 sizeof(struct mem_cgroup_threshold));
5504         }
5505
5506         /* Add new threshold */
5507         new->entries[size - 1].eventfd = eventfd;
5508         new->entries[size - 1].threshold = threshold;
5509
5510         /* Sort thresholds. Registering of new threshold isn't time-critical */
5511         sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
5512                         compare_thresholds, NULL);
5513
5514         /* Find current threshold */
5515         new->current_threshold = -1;
5516         for (i = 0; i < size; i++) {
5517                 if (new->entries[i].threshold <= usage) {
5518                         /*
5519                          * new->current_threshold will not be used until
5520                          * rcu_assign_pointer(), so it's safe to increment
5521                          * it here.
5522                          */
5523                         ++new->current_threshold;
5524                 } else
5525                         break;
5526         }
5527
5528         /* Free old spare buffer and save old primary buffer as spare */
5529         kfree(thresholds->spare);
5530         thresholds->spare = thresholds->primary;
5531
5532         rcu_assign_pointer(thresholds->primary, new);
5533
5534         /* To be sure that nobody uses thresholds */
5535         synchronize_rcu();
5536
5537 unlock:
5538         mutex_unlock(&memcg->thresholds_lock);
5539
5540         return ret;
5541 }
5542
5543 static void mem_cgroup_usage_unregister_event(struct cgroup_subsys_state *css,
5544         struct cftype *cft, struct eventfd_ctx *eventfd)
5545 {
5546         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5547         struct mem_cgroup_thresholds *thresholds;
5548         struct mem_cgroup_threshold_ary *new;
5549         enum res_type type = MEMFILE_TYPE(cft->private);
5550         u64 usage;
5551         int i, j, size;
5552
5553         mutex_lock(&memcg->thresholds_lock);
5554         if (type == _MEM)
5555                 thresholds = &memcg->thresholds;
5556         else if (type == _MEMSWAP)
5557                 thresholds = &memcg->memsw_thresholds;
5558         else
5559                 BUG();
5560
5561         if (!thresholds->primary)
5562                 goto unlock;
5563
5564         usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
5565
5566         /* Check if a threshold crossed before removing */
5567         __mem_cgroup_threshold(memcg, type == _MEMSWAP);
5568
5569         /* Calculate new number of threshold */
5570         size = 0;
5571         for (i = 0; i < thresholds->primary->size; i++) {
5572                 if (thresholds->primary->entries[i].eventfd != eventfd)
5573                         size++;
5574         }
5575
5576         new = thresholds->spare;
5577
5578         /* Set thresholds array to NULL if we don't have thresholds */
5579         if (!size) {
5580                 kfree(new);
5581                 new = NULL;
5582                 goto swap_buffers;
5583         }
5584
5585         new->size = size;
5586
5587         /* Copy thresholds and find current threshold */
5588         new->current_threshold = -1;
5589         for (i = 0, j = 0; i < thresholds->primary->size; i++) {
5590                 if (thresholds->primary->entries[i].eventfd == eventfd)
5591                         continue;
5592
5593                 new->entries[j] = thresholds->primary->entries[i];
5594                 if (new->entries[j].threshold <= usage) {
5595                         /*
5596                          * new->current_threshold will not be used
5597                          * until rcu_assign_pointer(), so it's safe to increment
5598                          * it here.
5599                          */
5600                         ++new->current_threshold;
5601                 }
5602                 j++;
5603         }
5604
5605 swap_buffers:
5606         /* Swap primary and spare array */
5607         thresholds->spare = thresholds->primary;
5608         /* If all events are unregistered, free the spare array */
5609         if (!new) {
5610                 kfree(thresholds->spare);
5611                 thresholds->spare = NULL;
5612         }
5613
5614         rcu_assign_pointer(thresholds->primary, new);
5615
5616         /* To be sure that nobody uses thresholds */
5617         synchronize_rcu();
5618 unlock:
5619         mutex_unlock(&memcg->thresholds_lock);
5620 }
5621
5622 static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css,
5623         struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
5624 {
5625         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5626         struct mem_cgroup_eventfd_list *event;
5627         enum res_type type = MEMFILE_TYPE(cft->private);
5628
5629         BUG_ON(type != _OOM_TYPE);
5630         event = kmalloc(sizeof(*event), GFP_KERNEL);
5631         if (!event)
5632                 return -ENOMEM;
5633
5634         spin_lock(&memcg_oom_lock);
5635
5636         event->eventfd = eventfd;
5637         list_add(&event->list, &memcg->oom_notify);
5638
5639         /* already in OOM ? */
5640         if (atomic_read(&memcg->under_oom))
5641                 eventfd_signal(eventfd, 1);
5642         spin_unlock(&memcg_oom_lock);
5643
5644         return 0;
5645 }
5646
5647 static void mem_cgroup_oom_unregister_event(struct cgroup_subsys_state *css,
5648         struct cftype *cft, struct eventfd_ctx *eventfd)
5649 {
5650         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5651         struct mem_cgroup_eventfd_list *ev, *tmp;
5652         enum res_type type = MEMFILE_TYPE(cft->private);
5653
5654         BUG_ON(type != _OOM_TYPE);
5655
5656         spin_lock(&memcg_oom_lock);
5657
5658         list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
5659                 if (ev->eventfd == eventfd) {
5660                         list_del(&ev->list);
5661                         kfree(ev);
5662                 }
5663         }
5664
5665         spin_unlock(&memcg_oom_lock);
5666 }
5667
5668 static int mem_cgroup_oom_control_read(struct cgroup_subsys_state *css,
5669         struct cftype *cft,  struct cgroup_map_cb *cb)
5670 {
5671         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5672
5673         cb->fill(cb, "oom_kill_disable", memcg->oom_kill_disable);
5674
5675         if (atomic_read(&memcg->under_oom))
5676                 cb->fill(cb, "under_oom", 1);
5677         else
5678                 cb->fill(cb, "under_oom", 0);
5679         return 0;
5680 }
5681
5682 static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
5683         struct cftype *cft, u64 val)
5684 {
5685         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5686         struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css));
5687
5688         /* cannot set to root cgroup and only 0 and 1 are allowed */
5689         if (!parent || !((val == 0) || (val == 1)))
5690                 return -EINVAL;
5691
5692         mutex_lock(&memcg_create_mutex);
5693         /* oom-kill-disable is a flag for subhierarchy. */
5694         if ((parent->use_hierarchy) || memcg_has_children(memcg)) {
5695                 mutex_unlock(&memcg_create_mutex);
5696                 return -EINVAL;
5697         }
5698         memcg->oom_kill_disable = val;
5699         if (!val)
5700                 memcg_oom_recover(memcg);
5701         mutex_unlock(&memcg_create_mutex);
5702         return 0;
5703 }
5704
5705 #ifdef CONFIG_MEMCG_KMEM
5706 static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
5707 {
5708         int ret;
5709
5710         memcg->kmemcg_id = -1;
5711         ret = memcg_propagate_kmem(memcg);
5712         if (ret)
5713                 return ret;
5714
5715         return mem_cgroup_sockets_init(memcg, ss);
5716 }
5717
/*
 * Tear down the kmem-related state of @memcg.  With CONFIG_MEMCG_KMEM this
 * only forwards to mem_cgroup_sockets_destroy().
 */
static void memcg_destroy_kmem(struct mem_cgroup *memcg)
{
	mem_cgroup_sockets_destroy(memcg);
}
5722
5723 static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)
5724 {
5725         if (!memcg_kmem_is_active(memcg))
5726                 return;
5727
5728         /*
5729          * kmem charges can outlive the cgroup. In the case of slab
5730          * pages, for instance, a page contain objects from various
5731          * processes. As we prevent from taking a reference for every
5732          * such allocation we have to be careful when doing uncharge
5733          * (see memcg_uncharge_kmem) and here during offlining.
5734          *
5735          * The idea is that that only the _last_ uncharge which sees
5736          * the dead memcg will drop the last reference. An additional
5737          * reference is taken here before the group is marked dead
5738          * which is then paired with css_put during uncharge resp. here.
5739          *
5740          * Although this might sound strange as this path is called from
5741          * css_offline() when the referencemight have dropped down to 0
5742          * and shouldn't be incremented anymore (css_tryget would fail)
5743          * we do not have other options because of the kmem allocations
5744          * lifetime.
5745          */
5746         css_get(&memcg->css);
5747
5748         memcg_kmem_mark_dead(memcg);
5749
5750         if (res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0)
5751                 return;
5752
5753         if (memcg_kmem_test_and_clear_dead(memcg))
5754                 css_put(&memcg->css);
5755 }
5756 #else
/* !CONFIG_MEMCG_KMEM stub: there is no kmem state to set up; always 0. */
static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
{
	return 0;
}
5761
/* !CONFIG_MEMCG_KMEM stub: nothing to tear down. */
static void memcg_destroy_kmem(struct mem_cgroup *memcg)
{
}
5765
/* !CONFIG_MEMCG_KMEM stub: no kmem accounting to offline. */
static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)
{
}
5769 #endif
5770
5771 static struct cftype mem_cgroup_files[] = {
5772         {
5773                 .name = "usage_in_bytes",
5774                 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
5775                 .read = mem_cgroup_read,
5776                 .register_event = mem_cgroup_usage_register_event,
5777                 .unregister_event = mem_cgroup_usage_unregister_event,
5778         },
5779         {
5780                 .name = "max_usage_in_bytes",
5781                 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
5782                 .trigger = mem_cgroup_reset,
5783                 .read = mem_cgroup_read,
5784         },
5785         {
5786                 .name = "limit_in_bytes",
5787                 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
5788                 .write_string = mem_cgroup_write,
5789                 .read = mem_cgroup_read,
5790         },
5791         {
5792                 .name = "soft_limit_in_bytes",
5793                 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
5794                 .write_string = mem_cgroup_write,
5795                 .read = mem_cgroup_read,
5796         },
5797         {
5798                 .name = "failcnt",
5799                 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
5800                 .trigger = mem_cgroup_reset,
5801                 .read = mem_cgroup_read,
5802         },
5803         {
5804                 .name = "stat",
5805                 .read_seq_string = memcg_stat_show,
5806         },
5807         {
5808                 .name = "force_empty",
5809                 .trigger = mem_cgroup_force_empty_write,
5810         },
5811         {
5812                 .name = "use_hierarchy",
5813                 .flags = CFTYPE_INSANE,
5814                 .write_u64 = mem_cgroup_hierarchy_write,
5815                 .read_u64 = mem_cgroup_hierarchy_read,
5816         },
5817         {
5818                 .name = "swappiness",
5819                 .read_u64 = mem_cgroup_swappiness_read,
5820                 .write_u64 = mem_cgroup_swappiness_write,
5821         },
5822         {
5823                 .name = "move_charge_at_immigrate",
5824                 .read_u64 = mem_cgroup_move_charge_read,
5825                 .write_u64 = mem_cgroup_move_charge_write,
5826         },
5827         {
5828                 .name = "oom_control",
5829                 .read_map = mem_cgroup_oom_control_read,
5830                 .write_u64 = mem_cgroup_oom_control_write,
5831                 .register_event = mem_cgroup_oom_register_event,
5832                 .unregister_event = mem_cgroup_oom_unregister_event,
5833                 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
5834         },
5835         {
5836                 .name = "pressure_level",
5837                 .register_event = vmpressure_register_event,
5838                 .unregister_event = vmpressure_unregister_event,
5839         },
5840 #ifdef CONFIG_NUMA
5841         {
5842                 .name = "numa_stat",
5843                 .read_seq_string = memcg_numa_stat_show,
5844         },
5845 #endif
5846 #ifdef CONFIG_MEMCG_KMEM
5847         {
5848                 .name = "kmem.limit_in_bytes",
5849                 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
5850                 .write_string = mem_cgroup_write,
5851                 .read = mem_cgroup_read,
5852         },
5853         {
5854                 .name = "kmem.usage_in_bytes",
5855                 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
5856                 .read = mem_cgroup_read,
5857         },
5858         {
5859                 .name = "kmem.failcnt",
5860                 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
5861                 .trigger = mem_cgroup_reset,
5862                 .read = mem_cgroup_read,
5863         },
5864         {
5865                 .name = "kmem.max_usage_in_bytes",
5866                 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
5867                 .trigger = mem_cgroup_reset,
5868                 .read = mem_cgroup_read,
5869         },
5870 #ifdef CONFIG_SLABINFO
5871         {
5872                 .name = "kmem.slabinfo",
5873                 .read_seq_string = mem_cgroup_slabinfo_read,
5874         },
5875 #endif
5876 #endif
5877         { },    /* terminate */
5878 };
5879
5880 #ifdef CONFIG_MEMCG_SWAP
5881 static struct cftype memsw_cgroup_files[] = {
5882         {
5883                 .name = "memsw.usage_in_bytes",
5884                 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
5885                 .read = mem_cgroup_read,
5886                 .register_event = mem_cgroup_usage_register_event,
5887                 .unregister_event = mem_cgroup_usage_unregister_event,
5888         },
5889         {
5890                 .name = "memsw.max_usage_in_bytes",
5891                 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
5892                 .trigger = mem_cgroup_reset,
5893                 .read = mem_cgroup_read,
5894         },
5895         {
5896                 .name = "memsw.limit_in_bytes",
5897                 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
5898                 .write_string = mem_cgroup_write,
5899                 .read = mem_cgroup_read,
5900         },
5901         {
5902                 .name = "memsw.failcnt",
5903                 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
5904                 .trigger = mem_cgroup_reset,
5905                 .read = mem_cgroup_read,
5906         },
5907         { },    /* terminate */
5908 };
5909 #endif
5910 static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
5911 {
5912         struct mem_cgroup_per_node *pn;
5913         struct mem_cgroup_per_zone *mz;
5914         int zone, tmp = node;
5915         /*
5916          * This routine is called against possible nodes.
5917          * But it's BUG to call kmalloc() against offline node.
5918          *
5919          * TODO: this routine can waste much memory for nodes which will
5920          *       never be onlined. It's better to use memory hotplug callback
5921          *       function.
5922          */
5923         if (!node_state(node, N_NORMAL_MEMORY))
5924                 tmp = -1;
5925         pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
5926         if (!pn)
5927                 return 1;
5928
5929         for (zone = 0; zone < MAX_NR_ZONES; zone++) {
5930                 mz = &pn->zoneinfo[zone];
5931                 lruvec_init(&mz->lruvec);
5932                 mz->memcg = memcg;
5933         }
5934         memcg->nodeinfo[node] = pn;
5935         return 0;
5936 }
5937
5938 static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
5939 {
5940         kfree(memcg->nodeinfo[node]);
5941 }
5942
5943 static struct mem_cgroup *mem_cgroup_alloc(void)
5944 {
5945         struct mem_cgroup *memcg;
5946         size_t size = memcg_size();
5947
5948         /* Can be very big if nr_node_ids is very big */
5949         if (size < PAGE_SIZE)
5950                 memcg = kzalloc(size, GFP_KERNEL);
5951         else
5952                 memcg = vzalloc(size);
5953
5954         if (!memcg)
5955                 return NULL;
5956
5957         memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
5958         if (!memcg->stat)
5959                 goto out_free;
5960         spin_lock_init(&memcg->pcp_counter_lock);
5961         return memcg;
5962
5963 out_free:
5964         if (size < PAGE_SIZE)
5965                 kfree(memcg);
5966         else
5967                 vfree(memcg);
5968         return NULL;
5969 }
5970
5971 /*
5972  * At destroying mem_cgroup, references from swap_cgroup can remain.
5973  * (scanning all at force_empty is too costly...)
5974  *
5975  * Instead of clearing all references at force_empty, we remember
5976  * the number of reference from swap_cgroup and free mem_cgroup when
5977  * it goes down to 0.
5978  *
5979  * Removal of cgroup itself succeeds regardless of refs from swap.
5980  */
5981
5982 static void __mem_cgroup_free(struct mem_cgroup *memcg)
5983 {
5984         int node;
5985         size_t size = memcg_size();
5986
5987         free_css_id(&mem_cgroup_subsys, &memcg->css);
5988
5989         for_each_node(node)
5990                 free_mem_cgroup_per_zone_info(memcg, node);
5991
5992         free_percpu(memcg->stat);
5993
5994         /*
5995          * We need to make sure that (at least for now), the jump label
5996          * destruction code runs outside of the cgroup lock. This is because
5997          * get_online_cpus(), which is called from the static_branch update,
5998          * can't be called inside the cgroup_lock. cpusets are the ones
5999          * enforcing this dependency, so if they ever change, we might as well.
6000          *
6001          * schedule_work() will guarantee this happens. Be careful if you need
6002          * to move this code around, and make sure it is outside
6003          * the cgroup_lock.
6004          */
6005         disarm_static_keys(memcg);
6006         if (size < PAGE_SIZE)
6007                 kfree(memcg);
6008         else
6009                 vfree(memcg);
6010 }
6011
6012 /*
6013  * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled.
6014  */
6015 struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
6016 {
6017         if (!memcg->res.parent)
6018                 return NULL;
6019         return mem_cgroup_from_res_counter(memcg->res.parent, res);
6020 }
6021 EXPORT_SYMBOL(parent_mem_cgroup);
6022
6023 static struct cgroup_subsys_state * __ref
6024 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
6025 {
6026         struct mem_cgroup *memcg;
6027         long error = -ENOMEM;
6028         int node;
6029
6030         memcg = mem_cgroup_alloc();
6031         if (!memcg)
6032                 return ERR_PTR(error);
6033
6034         for_each_node(node)
6035                 if (alloc_mem_cgroup_per_zone_info(memcg, node))
6036                         goto free_out;
6037
6038         /* root ? */
6039         if (parent_css == NULL) {
6040                 root_mem_cgroup = memcg;
6041                 res_counter_init(&memcg->res, NULL);
6042                 res_counter_init(&memcg->memsw, NULL);
6043                 res_counter_init(&memcg->kmem, NULL);
6044         }
6045
6046         memcg->last_scanned_node = MAX_NUMNODES;
6047         INIT_LIST_HEAD(&memcg->oom_notify);
6048         memcg->move_charge_at_immigrate = 0;
6049         mutex_init(&memcg->thresholds_lock);
6050         spin_lock_init(&memcg->move_lock);
6051         vmpressure_init(&memcg->vmpressure);
6052         spin_lock_init(&memcg->soft_lock);
6053
6054         return &memcg->css;
6055
6056 free_out:
6057         __mem_cgroup_free(memcg);
6058         return ERR_PTR(error);
6059 }
6060
6061 static int
6062 mem_cgroup_css_online(struct cgroup_subsys_state *css)
6063 {
6064         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6065         struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(css));
6066         int error = 0;
6067
6068         if (css->cgroup->id > MEM_CGROUP_ID_MAX)
6069                 return -ENOSPC;
6070
6071         if (!parent)
6072                 return 0;
6073
6074         mutex_lock(&memcg_create_mutex);
6075
6076         memcg->use_hierarchy = parent->use_hierarchy;
6077         memcg->oom_kill_disable = parent->oom_kill_disable;
6078         memcg->swappiness = mem_cgroup_swappiness(parent);
6079
6080         if (parent->use_hierarchy) {
6081                 res_counter_init(&memcg->res, &parent->res);
6082                 res_counter_init(&memcg->memsw, &parent->memsw);
6083                 res_counter_init(&memcg->kmem, &parent->kmem);
6084
6085                 /*
6086                  * No need to take a reference to the parent because cgroup
6087                  * core guarantees its existence.
6088                  */
6089         } else {
6090                 res_counter_init(&memcg->res, NULL);
6091                 res_counter_init(&memcg->memsw, NULL);
6092                 res_counter_init(&memcg->kmem, NULL);
6093                 /*
6094                  * Deeper hierachy with use_hierarchy == false doesn't make
6095                  * much sense so let cgroup subsystem know about this
6096                  * unfortunate state in our controller.
6097                  */
6098                 if (parent != root_mem_cgroup)
6099                         mem_cgroup_subsys.broken_hierarchy = true;
6100         }
6101
6102         error = memcg_init_kmem(memcg, &mem_cgroup_subsys);
6103         mutex_unlock(&memcg_create_mutex);
6104         return error;
6105 }
6106
6107 /*
6108  * Announce all parents that a group from their hierarchy is gone.
6109  */
6110 static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg)
6111 {
6112         struct mem_cgroup *parent = memcg;
6113
6114         while ((parent = parent_mem_cgroup(parent)))
6115                 mem_cgroup_iter_invalidate(parent);
6116
6117         /*
6118          * if the root memcg is not hierarchical we have to check it
6119          * explicitely.
6120          */
6121         if (!root_mem_cgroup->use_hierarchy)
6122                 mem_cgroup_iter_invalidate(root_mem_cgroup);
6123 }
6124
6125 static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
6126 {
6127         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6128
6129         kmem_cgroup_css_offline(memcg);
6130
6131         mem_cgroup_invalidate_reclaim_iterators(memcg);
6132         mem_cgroup_reparent_charges(memcg);
6133         if (memcg->soft_contributed) {
6134                 while ((memcg = parent_mem_cgroup(memcg)))
6135                         atomic_dec(&memcg->children_in_excess);
6136
6137                 if (memcg != root_mem_cgroup && !root_mem_cgroup->use_hierarchy)
6138                         atomic_dec(&root_mem_cgroup->children_in_excess);
6139         }
6140         mem_cgroup_destroy_all_caches(memcg);
6141         vmpressure_cleanup(&memcg->vmpressure);
6142 }
6143
/*
 * css_free callback: tear down the kmem state first, then release the
 * memcg and everything attached to it.
 */
static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
{
	struct mem_cgroup *dying = mem_cgroup_from_css(css);

	memcg_destroy_kmem(dying);
	__mem_cgroup_free(dying);
}
6151
6152 #ifdef CONFIG_MMU
6153 /* Handlers for move charge at task migration. */
6154 #define PRECHARGE_COUNT_AT_ONCE 256
6155 static int mem_cgroup_do_precharge(unsigned long count)
6156 {
6157         int ret = 0;
6158         int batch_count = PRECHARGE_COUNT_AT_ONCE;
6159         struct mem_cgroup *memcg = mc.to;
6160
6161         if (mem_cgroup_is_root(memcg)) {
6162                 mc.precharge += count;
6163                 /* we don't need css_get for root */
6164                 return ret;
6165         }
6166         /* try to charge at once */
6167         if (count > 1) {
6168                 struct res_counter *dummy;
6169                 /*
6170                  * "memcg" cannot be under rmdir() because we've already checked
6171                  * by cgroup_lock_live_cgroup() that it is not removed and we
6172                  * are still under the same cgroup_mutex. So we can postpone
6173                  * css_get().
6174                  */
6175                 if (res_counter_charge(&memcg->res, PAGE_SIZE * count, &dummy))
6176                         goto one_by_one;
6177                 if (do_swap_account && res_counter_charge(&memcg->memsw,
6178                                                 PAGE_SIZE * count, &dummy)) {
6179                         res_counter_uncharge(&memcg->res, PAGE_SIZE * count);
6180                         goto one_by_one;
6181                 }
6182                 mc.precharge += count;
6183                 return ret;
6184         }
6185 one_by_one:
6186         /* fall back to one by one charge */
6187         while (count--) {
6188                 if (signal_pending(current)) {
6189                         ret = -EINTR;
6190                         break;
6191                 }
6192                 if (!batch_count--) {
6193                         batch_count = PRECHARGE_COUNT_AT_ONCE;
6194                         cond_resched();
6195                 }
6196                 ret = __mem_cgroup_try_charge(NULL,
6197                                         GFP_KERNEL, 1, &memcg, false);
6198                 if (ret)
6199                         /* mem_cgroup_clear_mc() will do uncharge later */
6200                         return ret;
6201                 mc.precharge++;
6202         }
6203         return ret;
6204 }
6205
6206 /**
6207  * get_mctgt_type - get target type of moving charge
6208  * @vma: the vma the pte to be checked belongs
6209  * @addr: the address corresponding to the pte to be checked
6210  * @ptent: the pte to be checked
6211  * @target: the pointer the target page or swap ent will be stored(can be NULL)
6212  *
6213  * Returns
6214  *   0(MC_TARGET_NONE): if the pte is not a target for move charge.
6215  *   1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
6216  *     move charge. if @target is not NULL, the page is stored in target->page
6217  *     with extra refcnt got(Callers should handle it).
6218  *   2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
6219  *     target for charge migration. if @target is not NULL, the entry is stored
6220  *     in target->ent.
6221  *
6222  * Called with pte lock held.
6223  */
6224 union mc_target {
6225         struct page     *page;
6226         swp_entry_t     ent;
6227 };
6228
/* Classification of a pte (or pmd) with respect to moving charges. */
enum mc_target_type {
	MC_TARGET_NONE = 0,	/* not a target for move charge */
	MC_TARGET_PAGE = 1,	/* a page is the target */
	MC_TARGET_SWAP = 2,	/* a swap entry is the target */
};
6234
6235 static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
6236                                                 unsigned long addr, pte_t ptent)
6237 {
6238         struct page *page = vm_normal_page(vma, addr, ptent);
6239
6240         if (!page || !page_mapped(page))
6241                 return NULL;
6242         if (PageAnon(page)) {
6243                 /* we don't move shared anon */
6244                 if (!move_anon())
6245                         return NULL;
6246         } else if (!move_file())
6247                 /* we ignore mapcount for file pages */
6248                 return NULL;
6249         if (!get_page_unless_zero(page))
6250                 return NULL;
6251
6252         return page;
6253 }
6254
6255 #ifdef CONFIG_SWAP
6256 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
6257                         unsigned long addr, pte_t ptent, swp_entry_t *entry)
6258 {
6259         struct page *page = NULL;
6260         swp_entry_t ent = pte_to_swp_entry(ptent);
6261
6262         if (!move_anon() || non_swap_entry(ent))
6263                 return NULL;
6264         /*
6265          * Because lookup_swap_cache() updates some statistics counter,
6266          * we call find_get_page() with swapper_space directly.
6267          */
6268         page = find_get_page(swap_address_space(ent), ent.val);
6269         if (do_swap_account)
6270                 entry->val = ent.val;
6271
6272         return page;
6273 }
6274 #else
6275 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
6276                         unsigned long addr, pte_t ptent, swp_entry_t *entry)
6277 {
6278         return NULL;
6279 }
6280 #endif
6281
6282 static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
6283                         unsigned long addr, pte_t ptent, swp_entry_t *entry)
6284 {
6285         struct page *page = NULL;
6286         struct address_space *mapping;
6287         pgoff_t pgoff;
6288
6289         if (!vma->vm_file) /* anonymous vma */
6290                 return NULL;
6291         if (!move_file())
6292                 return NULL;
6293
6294         mapping = vma->vm_file->f_mapping;
6295         if (pte_none(ptent))
6296                 pgoff = linear_page_index(vma, addr);
6297         else /* pte_file(ptent) is true */
6298                 pgoff = pte_to_pgoff(ptent);
6299
6300         /* page is moved even if it's not RSS of this task(page-faulted). */
6301         page = find_get_page(mapping, pgoff);
6302
6303 #ifdef CONFIG_SWAP
6304         /* shmem/tmpfs may report page out on swap: account for that too. */
6305         if (radix_tree_exceptional_entry(page)) {
6306                 swp_entry_t swap = radix_to_swp_entry(page);
6307                 if (do_swap_account)
6308                         *entry = swap;
6309                 page = find_get_page(swap_address_space(swap), swap.val);
6310         }
6311 #endif
6312         return page;
6313 }
6314
6315 static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
6316                 unsigned long addr, pte_t ptent, union mc_target *target)
6317 {
6318         struct page *page = NULL;
6319         struct page_cgroup *pc;
6320         enum mc_target_type ret = MC_TARGET_NONE;
6321         swp_entry_t ent = { .val = 0 };
6322
6323         if (pte_present(ptent))
6324                 page = mc_handle_present_pte(vma, addr, ptent);
6325         else if (is_swap_pte(ptent))
6326                 page = mc_handle_swap_pte(vma, addr, ptent, &ent);
6327         else if (pte_none(ptent) || pte_file(ptent))
6328                 page = mc_handle_file_pte(vma, addr, ptent, &ent);
6329
6330         if (!page && !ent.val)
6331                 return ret;
6332         if (page) {
6333                 pc = lookup_page_cgroup(page);
6334                 /*
6335                  * Do only loose check w/o page_cgroup lock.
6336                  * mem_cgroup_move_account() checks the pc is valid or not under
6337                  * the lock.
6338                  */
6339                 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
6340                         ret = MC_TARGET_PAGE;
6341                         if (target)
6342                                 target->page = page;
6343                 }
6344                 if (!ret || !target)
6345                         put_page(page);
6346         }
6347         /* There is a swap entry and a page doesn't exist or isn't charged */
6348         if (ent.val && !ret &&
6349             mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
6350                 ret = MC_TARGET_SWAP;
6351                 if (target)
6352                         target->ent = ent;
6353         }
6354         return ret;
6355 }
6356
6357 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
6358 /*
6359  * We don't consider swapping or file mapped pages because THP does not
6360  * support them for now.
6361  * Caller should make sure that pmd_trans_huge(pmd) is true.
6362  */
6363 static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
6364                 unsigned long addr, pmd_t pmd, union mc_target *target)
6365 {
6366         struct page *page = NULL;
6367         struct page_cgroup *pc;
6368         enum mc_target_type ret = MC_TARGET_NONE;
6369
6370         page = pmd_page(pmd);
6371         VM_BUG_ON(!page || !PageHead(page));
6372         if (!move_anon())
6373                 return ret;
6374         pc = lookup_page_cgroup(page);
6375         if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
6376                 ret = MC_TARGET_PAGE;
6377                 if (target) {
6378                         get_page(page);
6379                         target->page = page;
6380                 }
6381         }
6382         return ret;
6383 }
6384 #else
6385 static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
6386                 unsigned long addr, pmd_t pmd, union mc_target *target)
6387 {
6388         return MC_TARGET_NONE;
6389 }
6390 #endif
6391
6392 static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
6393                                         unsigned long addr, unsigned long end,
6394                                         struct mm_walk *walk)
6395 {
6396         struct vm_area_struct *vma = walk->private;
6397         pte_t *pte;
6398         spinlock_t *ptl;
6399
6400         if (pmd_trans_huge_lock(pmd, vma) == 1) {
6401                 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
6402                         mc.precharge += HPAGE_PMD_NR;
6403                 spin_unlock(&vma->vm_mm->page_table_lock);
6404                 return 0;
6405         }
6406
6407         if (pmd_trans_unstable(pmd))
6408                 return 0;
6409         pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
6410         for (; addr != end; pte++, addr += PAGE_SIZE)
6411                 if (get_mctgt_type(vma, addr, *pte, NULL))
6412                         mc.precharge++; /* increment precharge temporarily */
6413         pte_unmap_unlock(pte - 1, ptl);
6414         cond_resched();
6415
6416         return 0;
6417 }
6418
6419 static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
6420 {
6421         unsigned long precharge;
6422         struct vm_area_struct *vma;
6423
6424         down_read(&mm->mmap_sem);
6425         for (vma = mm->mmap; vma; vma = vma->vm_next) {
6426                 struct mm_walk mem_cgroup_count_precharge_walk = {
6427                         .pmd_entry = mem_cgroup_count_precharge_pte_range,
6428                         .mm = mm,
6429                         .private = vma,
6430                 };
6431                 if (is_vm_hugetlb_page(vma))
6432                         continue;
6433                 walk_page_range(vma->vm_start, vma->vm_end,
6434                                         &mem_cgroup_count_precharge_walk);
6435         }
6436         up_read(&mm->mmap_sem);
6437
6438         precharge = mc.precharge;
6439         mc.precharge = 0;
6440
6441         return precharge;
6442 }
6443
6444 static int mem_cgroup_precharge_mc(struct mm_struct *mm)
6445 {
6446         unsigned long precharge = mem_cgroup_count_precharge(mm);
6447
6448         VM_BUG_ON(mc.moving_task);
6449         mc.moving_task = current;
6450         return mem_cgroup_do_precharge(precharge);
6451 }
6452
6453 /* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
6454 static void __mem_cgroup_clear_mc(void)
6455 {
6456         struct mem_cgroup *from = mc.from;
6457         struct mem_cgroup *to = mc.to;
6458         int i;
6459
6460         /* we must uncharge all the leftover precharges from mc.to */
6461         if (mc.precharge) {
6462                 __mem_cgroup_cancel_charge(mc.to, mc.precharge);
6463                 mc.precharge = 0;
6464         }
6465         /*
6466          * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
6467          * we must uncharge here.
6468          */
6469         if (mc.moved_charge) {
6470                 __mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
6471                 mc.moved_charge = 0;
6472         }
6473         /* we must fixup refcnts and charges */
6474         if (mc.moved_swap) {
6475                 /* uncharge swap account from the old cgroup */
6476                 if (!mem_cgroup_is_root(mc.from))
6477                         res_counter_uncharge(&mc.from->memsw,
6478                                                 PAGE_SIZE * mc.moved_swap);
6479
6480                 for (i = 0; i < mc.moved_swap; i++)
6481                         css_put(&mc.from->css);
6482
6483                 if (!mem_cgroup_is_root(mc.to)) {
6484                         /*
6485                          * we charged both to->res and to->memsw, so we should
6486                          * uncharge to->res.
6487                          */
6488                         res_counter_uncharge(&mc.to->res,
6489                                                 PAGE_SIZE * mc.moved_swap);
6490                 }
6491                 /* we've already done css_get(mc.to) */
6492                 mc.moved_swap = 0;
6493         }
6494         memcg_oom_recover(from);
6495         memcg_oom_recover(to);
6496         wake_up_all(&mc.waitq);
6497 }
6498
6499 static void mem_cgroup_clear_mc(void)
6500 {
6501         struct mem_cgroup *from = mc.from;
6502
6503         /*
6504          * we must clear moving_task before waking up waiters at the end of
6505          * task migration.
6506          */
6507         mc.moving_task = NULL;
6508         __mem_cgroup_clear_mc();
6509         spin_lock(&mc.lock);
6510         mc.from = NULL;
6511         mc.to = NULL;
6512         spin_unlock(&mc.lock);
6513         mem_cgroup_end_move(from);
6514 }
6515
6516 static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
6517                                  struct cgroup_taskset *tset)
6518 {
6519         struct task_struct *p = cgroup_taskset_first(tset);
6520         int ret = 0;
6521         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6522         unsigned long move_charge_at_immigrate;
6523
6524         /*
6525          * We are now commited to this value whatever it is. Changes in this
6526          * tunable will only affect upcoming migrations, not the current one.
6527          * So we need to save it, and keep it going.
6528          */
6529         move_charge_at_immigrate  = memcg->move_charge_at_immigrate;
6530         if (move_charge_at_immigrate) {
6531                 struct mm_struct *mm;
6532                 struct mem_cgroup *from = mem_cgroup_from_task(p);
6533
6534                 VM_BUG_ON(from == memcg);
6535
6536                 mm = get_task_mm(p);
6537                 if (!mm)
6538                         return 0;
6539                 /* We move charges only when we move a owner of the mm */
6540                 if (mm->owner == p) {
6541                         VM_BUG_ON(mc.from);
6542                         VM_BUG_ON(mc.to);
6543                         VM_BUG_ON(mc.precharge);
6544                         VM_BUG_ON(mc.moved_charge);
6545                         VM_BUG_ON(mc.moved_swap);
6546                         mem_cgroup_start_move(from);
6547                         spin_lock(&mc.lock);
6548                         mc.from = from;
6549                         mc.to = memcg;
6550                         mc.immigrate_flags = move_charge_at_immigrate;
6551                         spin_unlock(&mc.lock);
6552                         /* We set mc.moving_task later */
6553
6554                         ret = mem_cgroup_precharge_mc(mm);
6555                         if (ret)
6556                                 mem_cgroup_clear_mc();
6557                 }
6558                 mmput(mm);
6559         }
6560         return ret;
6561 }
6562
/*
 * ->cancel_attach callback: the attach was aborted, so clear the
 * move-charge context.  Neither @css nor @tset is consulted.
 */
static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css,
				     struct cgroup_taskset *tset)
{
	mem_cgroup_clear_mc();
}
6568
6569 static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
6570                                 unsigned long addr, unsigned long end,
6571                                 struct mm_walk *walk)
6572 {
6573         int ret = 0;
6574         struct vm_area_struct *vma = walk->private;
6575         pte_t *pte;
6576         spinlock_t *ptl;
6577         enum mc_target_type target_type;
6578         union mc_target target;
6579         struct page *page;
6580         struct page_cgroup *pc;
6581
6582         /*
6583          * We don't take compound_lock() here but no race with splitting thp
6584          * happens because:
6585          *  - if pmd_trans_huge_lock() returns 1, the relevant thp is not
6586          *    under splitting, which means there's no concurrent thp split,
6587          *  - if another thread runs into split_huge_page() just after we
6588          *    entered this if-block, the thread must wait for page table lock
6589          *    to be unlocked in __split_huge_page_splitting(), where the main
6590          *    part of thp split is not executed yet.
6591          */
6592         if (pmd_trans_huge_lock(pmd, vma) == 1) {
6593                 if (mc.precharge < HPAGE_PMD_NR) {
6594                         spin_unlock(&vma->vm_mm->page_table_lock);
6595                         return 0;
6596                 }
6597                 target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
6598                 if (target_type == MC_TARGET_PAGE) {
6599                         page = target.page;
6600                         if (!isolate_lru_page(page)) {
6601                                 pc = lookup_page_cgroup(page);
6602                                 if (!mem_cgroup_move_account(page, HPAGE_PMD_NR,
6603                                                         pc, mc.from, mc.to)) {
6604                                         mc.precharge -= HPAGE_PMD_NR;
6605                                         mc.moved_charge += HPAGE_PMD_NR;
6606                                 }
6607                                 putback_lru_page(page);
6608                         }
6609                         put_page(page);
6610                 }
6611                 spin_unlock(&vma->vm_mm->page_table_lock);
6612                 return 0;
6613         }
6614
6615         if (pmd_trans_unstable(pmd))
6616                 return 0;
6617 retry:
6618         pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
6619         for (; addr != end; addr += PAGE_SIZE) {
6620                 pte_t ptent = *(pte++);
6621                 swp_entry_t ent;
6622
6623                 if (!mc.precharge)
6624                         break;
6625
6626                 switch (get_mctgt_type(vma, addr, ptent, &target)) {
6627                 case MC_TARGET_PAGE:
6628                         page = target.page;
6629                         if (isolate_lru_page(page))
6630                                 goto put;
6631                         pc = lookup_page_cgroup(page);
6632                         if (!mem_cgroup_move_account(page, 1, pc,
6633                                                      mc.from, mc.to)) {
6634                                 mc.precharge--;
6635                                 /* we uncharge from mc.from later. */
6636                                 mc.moved_charge++;
6637                         }
6638                         putback_lru_page(page);
6639 put:                    /* get_mctgt_type() gets the page */
6640                         put_page(page);
6641                         break;
6642                 case MC_TARGET_SWAP:
6643                         ent = target.ent;
6644                         if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
6645                                 mc.precharge--;
6646                                 /* we fixup refcnts and charges later. */
6647                                 mc.moved_swap++;
6648                         }
6649                         break;
6650                 default:
6651                         break;
6652                 }
6653         }
6654         pte_unmap_unlock(pte - 1, ptl);
6655         cond_resched();
6656
6657         if (addr != end) {
6658                 /*
6659                  * We have consumed all precharges we got in can_attach().
6660                  * We try charge one by one, but don't do any additional
6661                  * charges to mc.to if we have failed in charge once in attach()
6662                  * phase.
6663                  */
6664                 ret = mem_cgroup_do_precharge(1);
6665                 if (!ret)
6666                         goto retry;
6667         }
6668
6669         return ret;
6670 }
6671
6672 static void mem_cgroup_move_charge(struct mm_struct *mm)
6673 {
6674         struct vm_area_struct *vma;
6675
6676         lru_add_drain_all();
6677 retry:
6678         if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
6679                 /*
6680                  * Someone who are holding the mmap_sem might be waiting in
6681                  * waitq. So we cancel all extra charges, wake up all waiters,
6682                  * and retry. Because we cancel precharges, we might not be able
6683                  * to move enough charges, but moving charge is a best-effort
6684                  * feature anyway, so it wouldn't be a big problem.
6685                  */
6686                 __mem_cgroup_clear_mc();
6687                 cond_resched();
6688                 goto retry;
6689         }
6690         for (vma = mm->mmap; vma; vma = vma->vm_next) {
6691                 int ret;
6692                 struct mm_walk mem_cgroup_move_charge_walk = {
6693                         .pmd_entry = mem_cgroup_move_charge_pte_range,
6694                         .mm = mm,
6695                         .private = vma,
6696                 };
6697                 if (is_vm_hugetlb_page(vma))
6698                         continue;
6699                 ret = walk_page_range(vma->vm_start, vma->vm_end,
6700                                                 &mem_cgroup_move_charge_walk);
6701                 if (ret)
6702                         /*
6703                          * means we have consumed all precharges and failed in
6704                          * doing additional charge. Just abandon here.
6705                          */
6706                         break;
6707         }
6708         up_read(&mm->mmap_sem);
6709 }
6710
6711 static void mem_cgroup_move_task(struct cgroup_subsys_state *css,
6712                                  struct cgroup_taskset *tset)
6713 {
6714         struct task_struct *p = cgroup_taskset_first(tset);
6715         struct mm_struct *mm = get_task_mm(p);
6716
6717         if (mm) {
6718                 if (mc.to)
6719                         mem_cgroup_move_charge(mm);
6720                 mmput(mm);
6721         }
6722         if (mc.to)
6723                 mem_cgroup_clear_mc();
6724 }
6725 #else   /* !CONFIG_MMU */
/* !CONFIG_MMU stub: no precharge is set up, the attach is always allowed. */
static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
				 struct cgroup_taskset *tset)
{
	return 0;
}
/* !CONFIG_MMU stub: nothing was set up in can_attach, nothing to undo. */
static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css,
				     struct cgroup_taskset *tset)
{
}
/* !CONFIG_MMU stub: no charges are moved on attach. */
static void mem_cgroup_move_task(struct cgroup_subsys_state *css,
				 struct cgroup_taskset *tset)
{
}
6739 #endif
6740
6741 /*
6742  * Cgroup retains root cgroups across [un]mount cycles making it necessary
6743  * to verify sane_behavior flag on each mount attempt.
6744  */
6745 static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
6746 {
6747         /*
6748          * use_hierarchy is forced with sane_behavior.  cgroup core
6749          * guarantees that @root doesn't have any children, so turning it
6750          * on for the root memcg is enough.
6751          */
6752         if (cgroup_sane_behavior(root_css->cgroup))
6753                 mem_cgroup_from_css(root_css)->use_hierarchy = true;
6754 }
6755
6756 struct cgroup_subsys mem_cgroup_subsys = {
6757         .name = "memory",
6758         .subsys_id = mem_cgroup_subsys_id,
6759         .css_alloc = mem_cgroup_css_alloc,
6760         .css_online = mem_cgroup_css_online,
6761         .css_offline = mem_cgroup_css_offline,
6762         .css_free = mem_cgroup_css_free,
6763         .can_attach = mem_cgroup_can_attach,
6764         .cancel_attach = mem_cgroup_cancel_attach,
6765         .attach = mem_cgroup_move_task,
6766         .bind = mem_cgroup_bind,
6767         .base_cftypes = mem_cgroup_files,
6768         .early_init = 0,
6769         .use_id = 1,
6770 };
6771
6772 #ifdef CONFIG_MEMCG_SWAP
6773 static int __init enable_swap_account(char *s)
6774 {
6775         if (!strcmp(s, "1"))
6776                 really_do_swap_account = 1;
6777         else if (!strcmp(s, "0"))
6778                 really_do_swap_account = 0;
6779         return 1;
6780 }
6781 __setup("swapaccount=", enable_swap_account);
6782
6783 static void __init memsw_file_init(void)
6784 {
6785         WARN_ON(cgroup_add_cftypes(&mem_cgroup_subsys, memsw_cgroup_files));
6786 }
6787
6788 static void __init enable_swap_cgroup(void)
6789 {
6790         if (!mem_cgroup_disabled() && really_do_swap_account) {
6791                 do_swap_account = 1;
6792                 memsw_file_init();
6793         }
6794 }
6795
6796 #else
6797 static void __init enable_swap_cgroup(void)
6798 {
6799 }
6800 #endif
6801
6802 /*
6803  * subsys_initcall() for memory controller.
6804  *
6805  * Some parts like hotcpu_notifier() have to be initialized from this context
6806  * because of lock dependencies (cgroup_lock -> cpu hotplug) but basically
6807  * everything that doesn't depend on a specific mem_cgroup structure should
6808  * be initialized from here.
6809  */
6810 static int __init mem_cgroup_init(void)
6811 {
6812         hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
6813         enable_swap_cgroup();
6814         memcg_stock_init();
6815         return 0;
6816 }
6817 subsys_initcall(mem_cgroup_init);