memcg: rename cgroup_event to mem_cgroup_event
index 1c52ddbc839ba1f8f42e940c51bc321ba6b2abfe..ec8582b3a2324bbd0b473b3d555e07f6474f1ac1 100644
@@ -45,6 +45,7 @@
 #include <linux/swapops.h>
 #include <linux/spinlock.h>
 #include <linux/eventfd.h>
+#include <linux/poll.h>
 #include <linux/sort.h>
 #include <linux/fs.h>
 #include <linux/seq_file.h>
@@ -54,6 +55,8 @@
 #include <linux/page_cgroup.h>
 #include <linux/cpu.h>
 #include <linux/oom.h>
+#include <linux/lockdep.h>
+#include <linux/file.h>
 #include "internal.h"
 #include <net/sock.h>
 #include <net/ip.h>
@@ -225,6 +228,46 @@ struct mem_cgroup_eventfd_list {
        struct eventfd_ctx *eventfd;
 };
 
+/*
+ * mem_cgroup_event represents events which userspace wants to receive.
+ */
+struct mem_cgroup_event {
+       /*
+        * memcg which the event belongs to.
+        */
+       struct mem_cgroup *memcg;
+       /*
+        * eventfd to signal userspace about the event.
+        */
+       struct eventfd_ctx *eventfd;
+       /*
+        * Each of these is stored in a list by the cgroup.
+        */
+       struct list_head list;
+       /*
+        * register_event() callback will be used to add a new userspace
+        * waiter for changes related to this event.  Use eventfd_signal()
+        * on the eventfd to send a notification to userspace.
+        */
+       int (*register_event)(struct mem_cgroup *memcg,
+                             struct eventfd_ctx *eventfd, const char *args);
+       /*
+        * unregister_event() callback will be called when userspace closes
+        * the eventfd or when the cgroup is removed.  This callback must
+        * be set if you want to provide notification functionality.
+        */
+       void (*unregister_event)(struct mem_cgroup *memcg,
+                                struct eventfd_ctx *eventfd);
+       /*
+        * All fields below are needed to unregister the event when
+        * userspace closes the eventfd.
+        */
+       poll_table pt;
+       wait_queue_head_t *wqh;
+       wait_queue_t wait;
+       struct work_struct remove;
+};
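
The eventfd referenced in this struct is the userspace end of the notification channel: whenever a registered condition triggers, the kernel calls eventfd_signal() on it and a blocked read() in the listener returns. A minimal userspace sketch of that listener side, assuming the eventfd has already been registered through cgroup.event_control (illustrative only, not part of this patch):

    #include <stdint.h>
    #include <stdio.h>
    #include <unistd.h>

    /* efd: an eventfd already registered with the memcg */
    static void wait_for_memcg_events(int efd)
    {
            uint64_t count;

            /* read() blocks until the kernel's eventfd_signal() bumps the counter */
            while (read(efd, &count, sizeof(count)) == sizeof(count))
                    printf("memcg event fired %llu time(s)\n",
                           (unsigned long long)count);
    }
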
+
 static void mem_cgroup_threshold(struct mem_cgroup *memcg);
 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
 
@@ -329,6 +372,10 @@ struct mem_cgroup {
        atomic_t        numainfo_updating;
 #endif
 
+       /* List of events which userspace wants to receive */
+       struct list_head event_list;
+       spinlock_t event_list_lock;
+
        struct mem_cgroup_per_node *nodeinfo[0];
        /* WARNING: nodeinfo must be the last member here */
 };
@@ -488,11 +535,6 @@ struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
        return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
 }
 
-struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css)
-{
-       return &mem_cgroup_from_css(css)->vmpressure;
-}
-
 static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
 {
        return (memcg == root_mem_cgroup);
@@ -866,6 +908,7 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
        unsigned long val = 0;
        int cpu;
 
+       get_online_cpus();
        for_each_online_cpu(cpu)
                val += per_cpu(memcg->stat->events[idx], cpu);
 #ifdef CONFIG_HOTPLUG_CPU
@@ -873,6 +916,7 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
        val += memcg->nocpu_base.events[idx];
        spin_unlock(&memcg->pcp_counter_lock);
 #endif
+       put_online_cpus();
        return val;
 }
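
The two hunks above bracket the per-cpu walk with get_online_cpus()/put_online_cpus() so that a CPU cannot be unplugged while its counter is being read. The same pattern in isolation, with a hypothetical counter (a sketch, not code from this patch):

    #include <linux/cpu.h>
    #include <linux/percpu.h>

    /* Hypothetical helper: sum a per-cpu counter without racing CPU hotplug. */
    static unsigned long sum_percpu_counter(unsigned long __percpu *ctr)
    {
            unsigned long val = 0;
            int cpu;

            get_online_cpus();              /* hold off CPU hotplug */
            for_each_online_cpu(cpu)
                    val += *per_cpu_ptr(ctr, cpu);
            put_online_cpus();

            return val;
    }
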
 
@@ -2044,6 +2088,12 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
        return total;
 }
 
+#ifdef CONFIG_LOCKDEP
+static struct lockdep_map memcg_oom_lock_dep_map = {
+       .name = "memcg_oom_lock",
+};
+#endif
+
 static DEFINE_SPINLOCK(memcg_oom_lock);
 
 /*
@@ -2081,7 +2131,8 @@ static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
                        }
                        iter->oom_lock = false;
                }
-       }
+       } else
+               mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);
 
        spin_unlock(&memcg_oom_lock);
 
@@ -2093,6 +2144,7 @@ static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
        struct mem_cgroup *iter;
 
        spin_lock(&memcg_oom_lock);
+       mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_);
        for_each_mem_cgroup_tree(iter, memcg)
                iter->oom_lock = false;
        spin_unlock(&memcg_oom_lock);
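
memcg_oom_lock_dep_map together with the mutex_acquire()/mutex_release() calls lets lockdep track the open-coded, hierarchy-wide OOM lock as if it were a real mutex, so inversions against other locks can be reported. A stripped-down sketch of that annotation pattern with hypothetical names (not part of this patch):

    #include <linux/kernel.h>
    #include <linux/lockdep.h>
    #include <linux/spinlock.h>
    #include <linux/types.h>

    static DEFINE_SPINLOCK(guard);          /* protects the flag below */
    static bool oom_lock_held;

    #ifdef CONFIG_LOCKDEP
    static struct lockdep_map sample_oom_dep_map = {
            .name = "sample_oom_lock",
    };
    #endif

    static bool sample_oom_trylock(void)
    {
            bool locked;

            spin_lock(&guard);
            locked = !oom_lock_held;
            if (locked) {
                    oom_lock_held = true;
                    /* tell lockdep we now own the virtual lock (trylock == 1) */
                    mutex_acquire(&sample_oom_dep_map, 0, 1, _RET_IP_);
            }
            spin_unlock(&guard);
            return locked;
    }

    static void sample_oom_unlock(void)
    {
            spin_lock(&guard);
            mutex_release(&sample_oom_dep_map, 1, _RET_IP_);
            oom_lock_held = false;
            spin_unlock(&guard);
    }
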
@@ -2159,110 +2211,59 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
                memcg_wakeup_oom(memcg);
 }
 
-/*
- * try to call OOM killer
- */
 static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
 {
-       bool locked;
-       int wakeups;
-
        if (!current->memcg_oom.may_oom)
                return;
-
-       current->memcg_oom.in_memcg_oom = 1;
-
        /*
-        * As with any blocking lock, a contender needs to start
-        * listening for wakeups before attempting the trylock,
-        * otherwise it can miss the wakeup from the unlock and sleep
-        * indefinitely.  This is just open-coded because our locking
-        * is so particular to memcg hierarchies.
+        * We are in the middle of the charge context here, so we
+        * don't want to block when potentially sitting on a callstack
+        * that holds all kinds of filesystem and mm locks.
+        *
+        * Also, the caller may handle a failed allocation gracefully
+        * (like optional page cache readahead) and so an OOM killer
+        * invocation might not even be necessary.
+        *
+        * That's why we don't do anything here except remember the
+        * OOM context and then deal with it at the end of the page
+        * fault when the stack is unwound, the locks are released,
+        * and when we know whether the fault was overall successful.
         */
-       wakeups = atomic_read(&memcg->oom_wakeups);
-       mem_cgroup_mark_under_oom(memcg);
-
-       locked = mem_cgroup_oom_trylock(memcg);
-
-       if (locked)
-               mem_cgroup_oom_notify(memcg);
-
-       if (locked && !memcg->oom_kill_disable) {
-               mem_cgroup_unmark_under_oom(memcg);
-               mem_cgroup_out_of_memory(memcg, mask, order);
-               mem_cgroup_oom_unlock(memcg);
-               /*
-                * There is no guarantee that an OOM-lock contender
-                * sees the wakeups triggered by the OOM kill
-                * uncharges.  Wake any sleepers explicitely.
-                */
-               memcg_oom_recover(memcg);
-       } else {
-               /*
-                * A system call can just return -ENOMEM, but if this
-                * is a page fault and somebody else is handling the
-                * OOM already, we need to sleep on the OOM waitqueue
-                * for this memcg until the situation is resolved.
-                * Which can take some time because it might be
-                * handled by a userspace task.
-                *
-                * However, this is the charge context, which means
-                * that we may sit on a large call stack and hold
-                * various filesystem locks, the mmap_sem etc. and we
-                * don't want the OOM handler to deadlock on them
-                * while we sit here and wait.  Store the current OOM
-                * context in the task_struct, then return -ENOMEM.
-                * At the end of the page fault handler, with the
-                * stack unwound, pagefault_out_of_memory() will check
-                * back with us by calling
-                * mem_cgroup_oom_synchronize(), possibly putting the
-                * task to sleep.
-                */
-               current->memcg_oom.oom_locked = locked;
-               current->memcg_oom.wakeups = wakeups;
-               css_get(&memcg->css);
-               current->memcg_oom.wait_on_memcg = memcg;
-       }
+       css_get(&memcg->css);
+       current->memcg_oom.memcg = memcg;
+       current->memcg_oom.gfp_mask = mask;
+       current->memcg_oom.order = order;
 }
 
 /**
  * mem_cgroup_oom_synchronize - complete memcg OOM handling
+ * @handle: actually kill/wait or just clean up the OOM state
  *
- * This has to be called at the end of a page fault if the the memcg
- * OOM handler was enabled and the fault is returning %VM_FAULT_OOM.
+ * This has to be called at the end of a page fault if the memcg OOM
+ * handler was enabled.
  *
- * Memcg supports userspace OOM handling, so failed allocations must
+ * Memcg supports userspace OOM handling where failed allocations must
  * sleep on a waitqueue until the userspace task resolves the
  * situation.  Sleeping directly in the charge context with all kinds
  * of locks held is not a good idea, instead we remember an OOM state
  * in the task and mem_cgroup_oom_synchronize() has to be called at
- * the end of the page fault to put the task to sleep and clean up the
- * OOM state.
+ * the end of the page fault to complete the OOM handling.
  *
  * Returns %true if an ongoing memcg OOM situation was detected and
- * finalized, %false otherwise.
+ * completed, %false otherwise.
  */
-bool mem_cgroup_oom_synchronize(void)
+bool mem_cgroup_oom_synchronize(bool handle)
 {
+       struct mem_cgroup *memcg = current->memcg_oom.memcg;
        struct oom_wait_info owait;
-       struct mem_cgroup *memcg;
+       bool locked;
 
        /* OOM is global, do not handle */
-       if (!current->memcg_oom.in_memcg_oom)
-               return false;
-
-       /*
-        * We invoked the OOM killer but there is a chance that a kill
-        * did not free up any charges.  Everybody else might already
-        * be sleeping, so restart the fault and keep the rampage
-        * going until some charges are released.
-        */
-       memcg = current->memcg_oom.wait_on_memcg;
        if (!memcg)
-               goto out;
+               return false;
 
-       if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
-               goto out_memcg;
+       if (!handle)
+               goto cleanup;
 
        owait.memcg = memcg;
        owait.wait.flags = 0;
@@ -2271,13 +2272,25 @@ bool mem_cgroup_oom_synchronize(void)
        INIT_LIST_HEAD(&owait.wait.task_list);
 
        prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
-       /* Only sleep if we didn't miss any wakeups since OOM */
-       if (atomic_read(&memcg->oom_wakeups) == current->memcg_oom.wakeups)
+       mem_cgroup_mark_under_oom(memcg);
+
+       locked = mem_cgroup_oom_trylock(memcg);
+
+       if (locked)
+               mem_cgroup_oom_notify(memcg);
+
+       if (locked && !memcg->oom_kill_disable) {
+               mem_cgroup_unmark_under_oom(memcg);
+               finish_wait(&memcg_oom_waitq, &owait.wait);
+               mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask,
+                                        current->memcg_oom.order);
+       } else {
                schedule();
-       finish_wait(&memcg_oom_waitq, &owait.wait);
-out_memcg:
-       mem_cgroup_unmark_under_oom(memcg);
-       if (current->memcg_oom.oom_locked) {
+               mem_cgroup_unmark_under_oom(memcg);
+               finish_wait(&memcg_oom_waitq, &owait.wait);
+       }
+
+       if (locked) {
                mem_cgroup_oom_unlock(memcg);
                /*
                 * There is no guarantee that an OOM-lock contender
@@ -2286,10 +2299,9 @@ out_memcg:
                 */
                memcg_oom_recover(memcg);
        }
+cleanup:
+       current->memcg_oom.memcg = NULL;
        css_put(&memcg->css);
-       current->memcg_oom.wait_on_memcg = NULL;
-out:
-       current->memcg_oom.in_memcg_oom = 0;
        return true;
 }
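
With this rework, mem_cgroup_oom() only records the OOM context in task_struct, and the fault exit path decides what to do with it through mem_cgroup_oom_synchronize(). A hedged sketch of a caller, based on the kernel-doc above rather than on the actual mm/memory.c changes (fault_finished() is illustrative):

    #include <linux/memcontrol.h>
    #include <linux/sched.h>

    /* Illustrative fault-exit hook, not the real page fault code. */
    static void fault_finished(bool fault_returned_oom)
    {
            if (!task_in_memcg_oom(current))
                    return;

            if (fault_returned_oom)
                    /* kill or wait on the recorded memcg, per oom_kill_disable */
                    mem_cgroup_oom_synchronize(true);
            else
                    /* the charge recovered on its own: just drop the OOM state */
                    mem_cgroup_oom_synchronize(false);
    }
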
 
@@ -2703,6 +2715,9 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
                     || fatal_signal_pending(current)))
                goto bypass;
 
+       if (unlikely(task_in_memcg_oom(current)))
+               goto bypass;
+
        /*
         * We always charge the cgroup the mm_struct belongs to.
         * The mm_struct's mem_cgroup changes on task migration if the
@@ -2800,8 +2815,10 @@ done:
        *ptr = memcg;
        return 0;
 nomem:
-       *ptr = NULL;
-       return -ENOMEM;
+       if (!(gfp_mask & __GFP_NOFAIL)) {
+               *ptr = NULL;
+               return -ENOMEM;
+       }
 bypass:
        *ptr = root_mem_cgroup;
        return -EINTR;
@@ -3806,8 +3823,7 @@ void mem_cgroup_move_account_page_stat(struct mem_cgroup *from,
 {
        /* Update stat data for mem_cgroup */
        preempt_disable();
-       WARN_ON_ONCE(from->stat->count[idx] < nr_pages);
-       __this_cpu_add(from->stat->count[idx], -nr_pages);
+       __this_cpu_sub(from->stat->count[idx], nr_pages);
        __this_cpu_add(to->stat->count[idx], nr_pages);
        preempt_enable();
 }
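
Dropping the explicit negation matters because nr_pages is an unsigned quantity while the per-cpu counters are longs: on 64-bit, adding -nr_pages adds a huge positive value instead of subtracting. A standalone userspace illustration of that C arithmetic (assuming an LP64 target):

    #include <stdio.h>

    int main(void)
    {
            long counter = 10;
            unsigned int nr_pages = 3;

            counter += -nr_pages;   /* -nr_pages == 4294967293 (unsigned int) */
            printf("%ld\n", counter);       /* 4294967303 on LP64, not 7 */

            counter = 10;
            counter -= nr_pages;    /* the subtraction __this_cpu_sub() expresses */
            printf("%ld\n", counter);       /* 7 */

            return 0;
    }
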
@@ -4983,31 +4999,18 @@ static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
        } while (usage > 0);
 }
 
-/*
- * This mainly exists for tests during the setting of set of use_hierarchy.
- * Since this is the very setting we are changing, the current hierarchy value
- * is meaningless
- */
-static inline bool __memcg_has_children(struct mem_cgroup *memcg)
-{
-       struct cgroup_subsys_state *pos;
-
-       /* bounce at first found */
-       css_for_each_child(pos, &memcg->css)
-               return true;
-       return false;
-}
-
-/*
- * Must be called with memcg_create_mutex held, unless the cgroup is guaranteed
- * to be already dead (as in mem_cgroup_force_empty, for instance).  This is
- * from mem_cgroup_count_children(), in the sense that we don't really care how
- * many children we have; we only need to know if we have any.  It also counts
- * any memcg without hierarchy as infertile.
- */
 static inline bool memcg_has_children(struct mem_cgroup *memcg)
 {
-       return memcg->use_hierarchy && __memcg_has_children(memcg);
+       lockdep_assert_held(&memcg_create_mutex);
+       /*
+        * The lock does not prevent addition to or deletion from the list
+        * of children, but it prevents a new child from being
+        * initialized based on this parent in css_online(), so it's
+        * enough to decide whether hierarchically inherited
+        * attributes can still be changed or not.
+        */
+       return memcg->use_hierarchy &&
+               !list_empty(&memcg->css.cgroup->children);
 }
 
 /*
@@ -5087,7 +5090,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
         */
        if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
                                (val == 1 || val == 0)) {
-               if (!__memcg_has_children(memcg))
+               if (list_empty(&memcg->css.cgroup->children))
                        memcg->use_hierarchy = val;
                else
                        retval = -EBUSY;
@@ -5674,13 +5677,11 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
                mem_cgroup_oom_notify_cb(iter);
 }
 
-static int mem_cgroup_usage_register_event(struct cgroup_subsys_state *css,
-       struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
+static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
+       struct eventfd_ctx *eventfd, const char *args, enum res_type type)
 {
-       struct mem_cgroup *memcg = mem_cgroup_from_css(css);
        struct mem_cgroup_thresholds *thresholds;
        struct mem_cgroup_threshold_ary *new;
-       enum res_type type = MEMFILE_TYPE(cft->private);
        u64 threshold, usage;
        int i, size, ret;
 
@@ -5757,13 +5758,23 @@ unlock:
        return ret;
 }
 
-static void mem_cgroup_usage_unregister_event(struct cgroup_subsys_state *css,
-       struct cftype *cft, struct eventfd_ctx *eventfd)
+static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
+       struct eventfd_ctx *eventfd, const char *args)
+{
+       return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
+}
+
+static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
+       struct eventfd_ctx *eventfd, const char *args)
+{
+       return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
+}
+
+static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
+       struct eventfd_ctx *eventfd, enum res_type type)
 {
-       struct mem_cgroup *memcg = mem_cgroup_from_css(css);
        struct mem_cgroup_thresholds *thresholds;
        struct mem_cgroup_threshold_ary *new;
-       enum res_type type = MEMFILE_TYPE(cft->private);
        u64 usage;
        int i, j, size;
 
@@ -5836,14 +5847,23 @@ unlock:
        mutex_unlock(&memcg->thresholds_lock);
 }
 
-static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css,
-       struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
+static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
+       struct eventfd_ctx *eventfd)
+{
+       return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
+}
+
+static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
+       struct eventfd_ctx *eventfd)
+{
+       return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
+}
+
+static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
+       struct eventfd_ctx *eventfd, const char *args)
 {
-       struct mem_cgroup *memcg = mem_cgroup_from_css(css);
        struct mem_cgroup_eventfd_list *event;
-       enum res_type type = MEMFILE_TYPE(cft->private);
 
-       BUG_ON(type != _OOM_TYPE);
        event = kmalloc(sizeof(*event), GFP_KERNEL);
        if (!event)
                return -ENOMEM;
@@ -5861,14 +5881,10 @@ static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css,
        return 0;
 }
 
-static void mem_cgroup_oom_unregister_event(struct cgroup_subsys_state *css,
-       struct cftype *cft, struct eventfd_ctx *eventfd)
+static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
+       struct eventfd_ctx *eventfd)
 {
-       struct mem_cgroup *memcg = mem_cgroup_from_css(css);
        struct mem_cgroup_eventfd_list *ev, *tmp;
-       enum res_type type = MEMFILE_TYPE(cft->private);
-
-       BUG_ON(type != _OOM_TYPE);
 
        spin_lock(&memcg_oom_lock);
 
@@ -5985,13 +6001,233 @@ static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)
 }
 #endif
 
+/*
+ * DO NOT USE IN NEW FILES.
+ *
+ * "cgroup.event_control" implementation.
+ *
+ * This is way over-engineered.  It tries to support fully configurable
+ * events for each user.  Such a level of flexibility is completely
+ * unnecessary, especially in light of the planned unified hierarchy.
+ *
+ * Please deprecate this and replace with something simpler if at all
+ * possible.
+ */
+
+/*
+ * Unregister event and free resources.
+ *
+ * Gets called from workqueue.
+ */
+static void memcg_event_remove(struct work_struct *work)
+{
+       struct mem_cgroup_event *event =
+               container_of(work, struct mem_cgroup_event, remove);
+       struct mem_cgroup *memcg = event->memcg;
+
+       remove_wait_queue(event->wqh, &event->wait);
+
+       event->unregister_event(memcg, event->eventfd);
+
+       /* Notify userspace the event is going away. */
+       eventfd_signal(event->eventfd, 1);
+
+       eventfd_ctx_put(event->eventfd);
+       kfree(event);
+       css_put(&memcg->css);
+}
+
+/*
+ * Gets called on POLLHUP on eventfd when user closes it.
+ *
+ * Called with wqh->lock held and interrupts disabled.
+ */
+static int memcg_event_wake(wait_queue_t *wait, unsigned mode,
+                           int sync, void *key)
+{
+       struct mem_cgroup_event *event =
+               container_of(wait, struct mem_cgroup_event, wait);
+       struct mem_cgroup *memcg = event->memcg;
+       unsigned long flags = (unsigned long)key;
+
+       if (flags & POLLHUP) {
+               /*
+                * If the event has been detached at cgroup removal, we
+                * can simply return knowing the other side will clean up
+                * for us.
+                *
+                * We can't race against event freeing since the other
+                * side will require wqh->lock via remove_wait_queue(),
+                * which we hold.
+                */
+               spin_lock(&memcg->event_list_lock);
+               if (!list_empty(&event->list)) {
+                       list_del_init(&event->list);
+                       /*
+                        * We are in atomic context, but memcg_event_remove()
+                        * may sleep, so we have to call it from a workqueue.
+                        */
+                       schedule_work(&event->remove);
+               }
+               spin_unlock(&memcg->event_list_lock);
+       }
+
+       return 0;
+}
+
+static void memcg_event_ptable_queue_proc(struct file *file,
+               wait_queue_head_t *wqh, poll_table *pt)
+{
+       struct mem_cgroup_event *event =
+               container_of(pt, struct mem_cgroup_event, pt);
+
+       event->wqh = wqh;
+       add_wait_queue(wqh, &event->wait);
+}
+
+/*
+ * DO NOT USE IN NEW FILES.
+ *
+ * Parse input and register new cgroup event handler.
+ *
+ * Input must be in format '<event_fd> <control_fd> <args>'.
+ * Interpretation of args is defined by control file implementation.
+ */
+static int memcg_write_event_control(struct cgroup_subsys_state *css,
+                                    struct cftype *cft, const char *buffer)
+{
+       struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+       struct mem_cgroup_event *event;
+       struct cgroup_subsys_state *cfile_css;
+       unsigned int efd, cfd;
+       struct fd efile;
+       struct fd cfile;
+       const char *name;
+       char *endp;
+       int ret;
+
+       efd = simple_strtoul(buffer, &endp, 10);
+       if (*endp != ' ')
+               return -EINVAL;
+       buffer = endp + 1;
+
+       cfd = simple_strtoul(buffer, &endp, 10);
+       if ((*endp != ' ') && (*endp != '\0'))
+               return -EINVAL;
+       buffer = endp + 1;
+
+       event = kzalloc(sizeof(*event), GFP_KERNEL);
+       if (!event)
+               return -ENOMEM;
+
+       event->memcg = memcg;
+       INIT_LIST_HEAD(&event->list);
+       init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
+       init_waitqueue_func_entry(&event->wait, memcg_event_wake);
+       INIT_WORK(&event->remove, memcg_event_remove);
+
+       efile = fdget(efd);
+       if (!efile.file) {
+               ret = -EBADF;
+               goto out_kfree;
+       }
+
+       event->eventfd = eventfd_ctx_fileget(efile.file);
+       if (IS_ERR(event->eventfd)) {
+               ret = PTR_ERR(event->eventfd);
+               goto out_put_efile;
+       }
+
+       cfile = fdget(cfd);
+       if (!cfile.file) {
+               ret = -EBADF;
+               goto out_put_eventfd;
+       }
+
+       /* the process needs read permission on the control file */
+       /* AV: shouldn't we check that it's been opened for read instead? */
+       ret = inode_permission(file_inode(cfile.file), MAY_READ);
+       if (ret < 0)
+               goto out_put_cfile;
+
+       /*
+        * Determine the event callbacks and set them in @event.  This used
+        * to be done via struct cftype but cgroup core no longer knows
+        * about these events.  The following is crude but the whole thing
+        * is for compatibility anyway.
+        *
+        * DO NOT ADD NEW FILES.
+        */
+       name = cfile.file->f_dentry->d_name.name;
+
+       if (!strcmp(name, "memory.usage_in_bytes")) {
+               event->register_event = mem_cgroup_usage_register_event;
+               event->unregister_event = mem_cgroup_usage_unregister_event;
+       } else if (!strcmp(name, "memory.oom_control")) {
+               event->register_event = mem_cgroup_oom_register_event;
+               event->unregister_event = mem_cgroup_oom_unregister_event;
+       } else if (!strcmp(name, "memory.pressure_level")) {
+               event->register_event = vmpressure_register_event;
+               event->unregister_event = vmpressure_unregister_event;
+       } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
+               event->register_event = memsw_cgroup_usage_register_event;
+               event->unregister_event = memsw_cgroup_usage_unregister_event;
+       } else {
+               ret = -EINVAL;
+               goto out_put_cfile;
+       }
+
+       /*
+        * Verify that @cfile belongs to @css.  Also, remaining events are
+        * automatically removed on cgroup destruction but the removal is
+        * asynchronous, so take an extra ref on @css.
+        */
+       rcu_read_lock();
+
+       ret = -EINVAL;
+       cfile_css = css_from_dir(cfile.file->f_dentry->d_parent,
+                                &mem_cgroup_subsys);
+       if (cfile_css == css && css_tryget(css))
+               ret = 0;
+
+       rcu_read_unlock();
+       if (ret)
+               goto out_put_cfile;
+
+       ret = event->register_event(memcg, event->eventfd, buffer);
+       if (ret)
+               goto out_put_css;
+
+       efile.file->f_op->poll(efile.file, &event->pt);
+
+       spin_lock(&memcg->event_list_lock);
+       list_add(&event->list, &memcg->event_list);
+       spin_unlock(&memcg->event_list_lock);
+
+       fdput(cfile);
+       fdput(efile);
+
+       return 0;
+
+out_put_css:
+       css_put(css);
+out_put_cfile:
+       fdput(cfile);
+out_put_eventfd:
+       eventfd_ctx_put(event->eventfd);
+out_put_efile:
+       fdput(efile);
+out_kfree:
+       kfree(event);
+
+       return ret;
+}
+
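
For reference, the registration side of the '<event_fd> <control_fd> <args>' protocol parsed above, as a hypothetical userspace sketch (the cgroup path argument and 64M-style threshold are made-up examples; error handling is mostly omitted):

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/eventfd.h>

    /* Returns an eventfd that fires when usage crosses the given threshold. */
    static int arm_usage_threshold(const char *cgrp_dir, unsigned long long bytes)
    {
            char path[256], buf[64];
            int efd, cfd, ecfd;

            efd = eventfd(0, 0);
            snprintf(path, sizeof(path), "%s/memory.usage_in_bytes", cgrp_dir);
            cfd = open(path, O_RDONLY);
            snprintf(path, sizeof(path), "%s/cgroup.event_control", cgrp_dir);
            ecfd = open(path, O_WRONLY);

            /* "<event_fd> <control_fd> <args>" */
            snprintf(buf, sizeof(buf), "%d %d %llu", efd, cfd, bytes);
            if (write(ecfd, buf, strlen(buf)) < 0)
                    return -1;

            close(ecfd);
            close(cfd);     /* the kernel took its own references during registration */
            return efd;     /* read() blocks on this fd until the threshold fires */
    }

A caller would then block in an 8-byte read() on the returned eventfd, as in the listener sketch shown earlier, until the registered condition triggers.
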
 static struct cftype mem_cgroup_files[] = {
        {
                .name = "usage_in_bytes",
                .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
                .read = mem_cgroup_read,
-               .register_event = mem_cgroup_usage_register_event,
-               .unregister_event = mem_cgroup_usage_unregister_event,
        },
        {
                .name = "max_usage_in_bytes",
@@ -6031,6 +6267,12 @@ static struct cftype mem_cgroup_files[] = {
                .write_u64 = mem_cgroup_hierarchy_write,
                .read_u64 = mem_cgroup_hierarchy_read,
        },
+       {
+               .name = "cgroup.event_control",         /* XXX: for compat */
+               .write_string = memcg_write_event_control,
+               .flags = CFTYPE_NO_PREFIX,
+               .mode = S_IWUGO,
+       },
        {
                .name = "swappiness",
                .read_u64 = mem_cgroup_swappiness_read,
@@ -6045,14 +6287,10 @@ static struct cftype mem_cgroup_files[] = {
                .name = "oom_control",
                .read_map = mem_cgroup_oom_control_read,
                .write_u64 = mem_cgroup_oom_control_write,
-               .register_event = mem_cgroup_oom_register_event,
-               .unregister_event = mem_cgroup_oom_unregister_event,
                .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
        },
        {
                .name = "pressure_level",
-               .register_event = vmpressure_register_event,
-               .unregister_event = vmpressure_unregister_event,
        },
 #ifdef CONFIG_NUMA
        {
@@ -6100,8 +6338,6 @@ static struct cftype memsw_cgroup_files[] = {
                .name = "memsw.usage_in_bytes",
                .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
                .read = mem_cgroup_read,
-               .register_event = mem_cgroup_usage_register_event,
-               .unregister_event = mem_cgroup_usage_unregister_event,
        },
        {
                .name = "memsw.max_usage_in_bytes",
@@ -6292,6 +6528,8 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
        mutex_init(&memcg->thresholds_lock);
        spin_lock_init(&memcg->move_lock);
        vmpressure_init(&memcg->vmpressure);
+       INIT_LIST_HEAD(&memcg->event_list);
+       spin_lock_init(&memcg->event_list_lock);
 
        return &memcg->css;
 
@@ -6364,6 +6602,19 @@ static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg)
 static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 {
        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+       struct mem_cgroup_event *event, *tmp;
+
+       /*
+        * Unregister events and notify userspace.
+        * Notify userspace about cgroup removal only after rmdir of the
+        * cgroup directory, to avoid a race between userspace and kernelspace.
+        */
+       spin_lock(&memcg->event_list_lock);
+       list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
+               list_del_init(&event->list);
+               schedule_work(&event->remove);
+       }
+       spin_unlock(&memcg->event_list_lock);
 
        kmem_cgroup_css_offline(memcg);