cgroup: Merge branch 'memcg_event' into for-3.14

author Tejun Heo <tj@kernel.org>

Fri, 22 Nov 2013 23:32:25 +0000 (18:32 -0500)

committer Tejun Heo <tj@kernel.org>

Fri, 22 Nov 2013 23:32:25 +0000 (18:32 -0500)
author Tejun Heo <tj@kernel.org>
Fri, 22 Nov 2013 23:32:25 +0000 (18:32 -0500)
committer Tejun Heo <tj@kernel.org>
Fri, 22 Nov 2013 23:32:25 +0000 (18:32 -0500)
diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt

index 638bf17ff86965a561b5cc258451ea53bd752f1a..821de56d15802c3e41a18d4aac41dbb64f272832 100644 (file)
--- a/Documentation/cgroups/cgroups.txt
+++ b/Documentation/cgroups/cgroups.txt
@@ -24,7 +24,6 @@ CONTENTS:
    2.1 Basic Usage
    2.2 Attaching processes
    2.3 Mounting hierarchies by name
-  2.4 Notification API
  3. Kernel API
    3.1 Overview
    3.2 Synchronization
@@ -472,25 +471,6 @@ you give a subsystem a name.
  The name of the subsystem appears as part of the hierarchy description
  in /proc/mounts and /proc/<pid>/cgroups.
  
-2.4 Notification API
---------------------
-
-There is mechanism which allows to get notifications about changing
-status of a cgroup.
-
-To register a new notification handler you need to:
- - create a file descriptor for event notification using eventfd(2);
- - open a control file to be monitored (e.g. memory.usage_in_bytes);
- - write "<event_fd> <control_fd> <args>" to cgroup.event_control.
-   Interpretation of args is defined by control file implementation;
-
-eventfd will be woken up by control file implementation or when the
-cgroup is removed.
-
-To unregister a notification handler just close eventfd.
-
-NOTE: Support of notifications should be implemented for the control
-file. See documentation for the subsystem.
  
  3. Kernel API
  =============
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h

index 39c1d946967778d29df5399d5ad222ff7a2ee2b8..492fa01ec2d3fe25331949e10120860cb379a5c7 100644 (file)
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -29,7 +29,6 @@ struct cgroup_subsys;
  struct inode;
  struct cgroup;
  struct css_id;
-struct eventfd_ctx;
  
  extern int cgroup_init_early(void);
  extern int cgroup_init(void);
@@ -239,10 +238,6 @@ struct cgroup {
         struct rcu_head rcu_head;
         struct work_struct destroy_work;
  
-       /* List of events which userspace want to receive */
-       struct list_head event_list;
-       spinlock_t event_list_lock;
-
         /* directory xattrs */
         struct simple_xattrs xattrs;
  };
@@ -506,25 +501,6 @@ struct cftype {
         int (*trigger)(struct cgroup_subsys_state *css, unsigned int event);
  
         int (*release)(struct inode *inode, struct file *file);
-
-       /*
-        * register_event() callback will be used to add new userspace
-        * waiter for changes related to the cftype. Implement it if
-        * you want to provide this functionality. Use eventfd_signal()
-        * on eventfd to send notification to userspace.
-        */
-       int (*register_event)(struct cgroup_subsys_state *css,
-                             struct cftype *cft, struct eventfd_ctx *eventfd,
-                             const char *args);
-       /*
-        * unregister_event() callback will be called when userspace
-        * closes the eventfd or on cgroup removing.
-        * This callback must be implemented, if you want provide
-        * notification functionality.
-        */
-       void (*unregister_event)(struct cgroup_subsys_state *css,
-                                struct cftype *cft,
-                                struct eventfd_ctx *eventfd);
  };
  
  /*
diff --git a/include/linux/vmpressure.h b/include/linux/vmpressure.h

index 3f3788d4936292ce150a6cb4b56036a58c541128..3e4535876d37493e590d43a687aa980646dab0c4 100644 (file)
--- a/include/linux/vmpressure.h
+++ b/include/linux/vmpressure.h
@@ -7,6 +7,7 @@
  #include <linux/gfp.h>
  #include <linux/types.h>
  #include <linux/cgroup.h>
+#include <linux/eventfd.h>
  
  struct vmpressure {
         unsigned long scanned;
@@ -33,13 +34,10 @@ extern void vmpressure_init(struct vmpressure *vmpr);
  extern void vmpressure_cleanup(struct vmpressure *vmpr);
  extern struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg);
  extern struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr);
-extern struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css);
-extern int vmpressure_register_event(struct cgroup_subsys_state *css,
-                                    struct cftype *cft,
+extern int vmpressure_register_event(struct mem_cgroup *memcg,
                                      struct eventfd_ctx *eventfd,
                                      const char *args);
-extern void vmpressure_unregister_event(struct cgroup_subsys_state *css,
-                                       struct cftype *cft,
+extern void vmpressure_unregister_event(struct mem_cgroup *memcg,
                                         struct eventfd_ctx *eventfd);
  #else
  static inline void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
diff --git a/init/Kconfig b/init/Kconfig

index 79383d3aa5dc5f7fe64abdcaf7d511144ca016e5..93f344337172c44bc7d43fcf3a5241d1eca923c8 100644 (file)
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -848,7 +848,6 @@ config NUMA_BALANCING
  
  menuconfig CGROUPS
         boolean "Control Group support"
-       depends on EVENTFD
         help
           This option adds support for grouping sets of processes together, for
           use with process control subsystems such as Cpusets, CFS, memory
@@ -915,6 +914,7 @@ config MEMCG
         bool "Memory Resource Controller for Control Groups"
         depends on RESOURCE_COUNTERS
         select MM_OWNER
+       select EVENTFD
         help
           Provides a memory resource controller that manages both anonymous
           memory and page cache. (See Documentation/cgroups/memory.txt)
@@ -1154,7 +1154,6 @@ config UIDGID_STRICT_TYPE_CHECKS
  
  config SCHED_AUTOGROUP
         bool "Automatic process group scheduling"
-       select EVENTFD
         select CGROUPS
         select CGROUP_SCHED
         select FAIR_GROUP_SCHED
diff --git a/kernel/cgroup.c b/kernel/cgroup.c

index a7b98ee35ef7249c886d9ca77e729ef9cf680bef..be42967f4f1afb98fd3ee0dbf94a38934395c583 100644 (file)
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -56,11 +56,8 @@
  #include <linux/pid_namespace.h>
  #include <linux/idr.h>
  #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
-#include <linux/eventfd.h>
-#include <linux/poll.h>
  #include <linux/flex_array.h> /* used in cgroup_attach_task */
  #include <linux/kthread.h>
-#include <linux/file.h>
  
  #include <linux/atomic.h>
  
@@ -132,36 +129,6 @@ struct cfent {
         struct simple_xattrs            xattrs;
  };
  
-/*
- * cgroup_event represents events which userspace want to receive.
- */
-struct cgroup_event {
-       /*
-        * css which the event belongs to.
-        */
-       struct cgroup_subsys_state *css;
-       /*
-        * Control file which the event associated.
-        */
-       struct cftype *cft;
-       /*
-        * eventfd to signal userspace about the event.
-        */
-       struct eventfd_ctx *eventfd;
-       /*
-        * Each of these stored in a list by the cgroup.
-        */
-       struct list_head list;
-       /*
-        * All fields below needed to unregister event when
-        * userspace closes eventfd.
-        */
-       poll_table pt;
-       wait_queue_head_t *wqh;
-       wait_queue_t wait;
-       struct work_struct remove;
-};
-
  /* The list of hierarchy roots */
  
  static LIST_HEAD(cgroup_roots);
@@ -1351,8 +1318,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
         INIT_LIST_HEAD(&cgrp->pidlists);
         mutex_init(&cgrp->pidlist_mutex);
         cgrp->dummy_css.cgroup = cgrp;
-       INIT_LIST_HEAD(&cgrp->event_list);
-       spin_lock_init(&cgrp->event_list_lock);
         simple_xattrs_init(&cgrp->xattrs);
  }
  
@@ -2626,16 +2591,6 @@ static const struct inode_operations cgroup_dir_inode_operations = {
         .removexattr = cgroup_removexattr,
  };
  
-/*
- * Check if a file is a control file
- */
-static inline struct cftype *__file_cft(struct file *file)
-{
-       if (file_inode(file)->i_fop != &cgroup_file_operations)
-               return ERR_PTR(-EINVAL);
-       return __d_cft(file->f_dentry);
-}
-
  static int cgroup_create_file(struct dentry *dentry, umode_t mode,
                                 struct super_block *sb)
  {
@@ -3915,202 +3870,6 @@ static void cgroup_dput(struct cgroup *cgrp)
         deactivate_super(sb);
  }
  
-/*
- * Unregister event and free resources.
- *
- * Gets called from workqueue.
- */
-static void cgroup_event_remove(struct work_struct *work)
-{
-       struct cgroup_event *event = container_of(work, struct cgroup_event,
-                       remove);
-       struct cgroup_subsys_state *css = event->css;
-
-       remove_wait_queue(event->wqh, &event->wait);
-
-       event->cft->unregister_event(css, event->cft, event->eventfd);
-
-       /* Notify userspace the event is going away. */
-       eventfd_signal(event->eventfd, 1);
-
-       eventfd_ctx_put(event->eventfd);
-       kfree(event);
-       css_put(css);
-}
-
-/*
- * Gets called on POLLHUP on eventfd when user closes it.
- *
- * Called with wqh->lock held and interrupts disabled.
- */
-static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
-               int sync, void *key)
-{
-       struct cgroup_event *event = container_of(wait,
-                       struct cgroup_event, wait);
-       struct cgroup *cgrp = event->css->cgroup;
-       unsigned long flags = (unsigned long)key;
-
-       if (flags & POLLHUP) {
-               /*
-                * If the event has been detached at cgroup removal, we
-                * can simply return knowing the other side will cleanup
-                * for us.
-                *
-                * We can't race against event freeing since the other
-                * side will require wqh->lock via remove_wait_queue(),
-                * which we hold.
-                */
-               spin_lock(&cgrp->event_list_lock);
-               if (!list_empty(&event->list)) {
-                       list_del_init(&event->list);
-                       /*
-                        * We are in atomic context, but cgroup_event_remove()
-                        * may sleep, so we have to call it in workqueue.
-                        */
-                       schedule_work(&event->remove);
-               }
-               spin_unlock(&cgrp->event_list_lock);
-       }
-
-       return 0;
-}
-
-static void cgroup_event_ptable_queue_proc(struct file *file,
-               wait_queue_head_t *wqh, poll_table *pt)
-{
-       struct cgroup_event *event = container_of(pt,
-                       struct cgroup_event, pt);
-
-       event->wqh = wqh;
-       add_wait_queue(wqh, &event->wait);
-}
-
-/*
- * Parse input and register new cgroup event handler.
- *
- * Input must be in format '<event_fd> <control_fd> <args>'.
- * Interpretation of args is defined by control file implementation.
- */
-static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css,
-                                     struct cftype *cft, const char *buffer)
-{
-       struct cgroup *cgrp = dummy_css->cgroup;
-       struct cgroup_event *event;
-       struct cgroup_subsys_state *cfile_css;
-       unsigned int efd, cfd;
-       struct fd efile;
-       struct fd cfile;
-       char *endp;
-       int ret;
-
-       efd = simple_strtoul(buffer, &endp, 10);
-       if (*endp != ' ')
-               return -EINVAL;
-       buffer = endp + 1;
-
-       cfd = simple_strtoul(buffer, &endp, 10);
-       if ((*endp != ' ') && (*endp != '\0'))
-               return -EINVAL;
-       buffer = endp + 1;
-
-       event = kzalloc(sizeof(*event), GFP_KERNEL);
-       if (!event)
-               return -ENOMEM;
-
-       INIT_LIST_HEAD(&event->list);
-       init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc);
-       init_waitqueue_func_entry(&event->wait, cgroup_event_wake);
-       INIT_WORK(&event->remove, cgroup_event_remove);
-
-       efile = fdget(efd);
-       if (!efile.file) {
-               ret = -EBADF;
-               goto out_kfree;
-       }
-
-       event->eventfd = eventfd_ctx_fileget(efile.file);
-       if (IS_ERR(event->eventfd)) {
-               ret = PTR_ERR(event->eventfd);
-               goto out_put_efile;
-       }
-
-       cfile = fdget(cfd);
-       if (!cfile.file) {
-               ret = -EBADF;
-               goto out_put_eventfd;
-       }
-
-       /* the process need read permission on control file */
-       /* AV: shouldn't we check that it's been opened for read instead? */
-       ret = inode_permission(file_inode(cfile.file), MAY_READ);
-       if (ret < 0)
-               goto out_put_cfile;
-
-       event->cft = __file_cft(cfile.file);
-       if (IS_ERR(event->cft)) {
-               ret = PTR_ERR(event->cft);
-               goto out_put_cfile;
-       }
-
-       if (!event->cft->ss) {
-               ret = -EBADF;
-               goto out_put_cfile;
-       }
-
-       /*
-        * Determine the css of @cfile, verify it belongs to the same
-        * cgroup as cgroup.event_control, and associate @event with it.
-        * Remaining events are automatically removed on cgroup destruction
-        * but the removal is asynchronous, so take an extra ref.
-        */
-       rcu_read_lock();
-
-       ret = -EINVAL;
-       event->css = cgroup_css(cgrp, event->cft->ss);
-       cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, event->cft->ss);
-       if (event->css && event->css == cfile_css && css_tryget(event->css))
-               ret = 0;
-
-       rcu_read_unlock();
-       if (ret)
-               goto out_put_cfile;
-
-       if (!event->cft->register_event || !event->cft->unregister_event) {
-               ret = -EINVAL;
-               goto out_put_css;
-       }
-
-       ret = event->cft->register_event(event->css, event->cft,
-                       event->eventfd, buffer);
-       if (ret)
-               goto out_put_css;
-
-       efile.file->f_op->poll(efile.file, &event->pt);
-
-       spin_lock(&cgrp->event_list_lock);
-       list_add(&event->list, &cgrp->event_list);
-       spin_unlock(&cgrp->event_list_lock);
-
-       fdput(cfile);
-       fdput(efile);
-
-       return 0;
-
-out_put_css:
-       css_put(event->css);
-out_put_cfile:
-       fdput(cfile);
-out_put_eventfd:
-       eventfd_ctx_put(event->eventfd);
-out_put_efile:
-       fdput(efile);
-out_kfree:
-       kfree(event);
-
-       return ret;
-}
-
  static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
                                       struct cftype *cft)
  {
@@ -4135,11 +3894,6 @@ static struct cftype cgroup_base_files[] = {
                 .release = cgroup_pidlist_release,
                 .mode = S_IRUGO | S_IWUSR,
         },
-       {
-               .name = "cgroup.event_control",
-               .write_string = cgroup_write_event_control,
-               .mode = S_IWUGO,
-       },
         {
                 .name = "cgroup.clone_children",
                 .flags = CFTYPE_INSANE,
@@ -4610,7 +4364,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
         __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
  {
         struct dentry *d = cgrp->dentry;
-       struct cgroup_event *event, *tmp;
         struct cgroup_subsys *ss;
         struct cgroup *child;
         bool empty;
@@ -4685,18 +4438,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
         dget(d);
         cgroup_d_remove_dir(d);
  
-       /*
-        * Unregister events and notify userspace.
-        * Notify userspace about cgroup removing only after rmdir of cgroup
-        * directory to avoid race between userspace and kernelspace.
-        */
-       spin_lock(&cgrp->event_list_lock);
-       list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
-               list_del_init(&event->list);
-               schedule_work(&event->remove);
-       }
-       spin_unlock(&cgrp->event_list_lock);
-
         return 0;
  };
  
diff --git a/mm/memcontrol.c b/mm/memcontrol.c

index f1a0ae6e11b86b3020c90d7241ba12d47d2bbaa8..7aa0d405b14865fcffbc484dddf5703ffadd0ab5 100644 (file)
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -45,6 +45,7 @@
  #include <linux/swapops.h>
  #include <linux/spinlock.h>
  #include <linux/eventfd.h>
+#include <linux/poll.h>
  #include <linux/sort.h>
  #include <linux/fs.h>
  #include <linux/seq_file.h>
@@ -55,6 +56,7 @@
  #include <linux/cpu.h>
  #include <linux/oom.h>
  #include <linux/lockdep.h>
+#include <linux/file.h>
  #include "internal.h"
  #include <net/sock.h>
  #include <net/ip.h>
@@ -227,6 +229,46 @@ struct mem_cgroup_eventfd_list {
         struct eventfd_ctx *eventfd;
  };
  
+/*
+ * mem_cgroup_event represents events which userspace wants to receive.
+ */
+struct mem_cgroup_event {
+       /*
+        * memcg which the event belongs to.
+        */
+       struct mem_cgroup *memcg;
+       /*
+        * eventfd to signal userspace about the event.
+        */
+       struct eventfd_ctx *eventfd;
+       /*
+        * Each of these stored in a list by the cgroup.
+        */
+       struct list_head list;
+       /*
+        * register_event() callback will be used to add new userspace
+        * waiter for changes related to this event.  Use eventfd_signal()
+        * on eventfd to send notification to userspace.
+        */
+       int (*register_event)(struct mem_cgroup *memcg,
+                             struct eventfd_ctx *eventfd, const char *args);
+       /*
+        * unregister_event() callback will be called when userspace closes
+        * the eventfd or on cgroup removal.  This callback must be set,
+        * if you want to provide notification functionality.
+        */
+       void (*unregister_event)(struct mem_cgroup *memcg,
+                                struct eventfd_ctx *eventfd);
+       /*
+        * All fields below needed to unregister event when
+        * userspace closes eventfd.
+        */
+       poll_table pt;
+       wait_queue_head_t *wqh;
+       wait_queue_t wait;
+       struct work_struct remove;
+};
+
  static void mem_cgroup_threshold(struct mem_cgroup *memcg);
  static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
  
@@ -331,6 +373,10 @@ struct mem_cgroup {
         atomic_t        numainfo_updating;
  #endif
  
+       /* List of events which userspace wants to receive */
+       struct list_head event_list;
+       spinlock_t event_list_lock;
+
         struct mem_cgroup_per_node *nodeinfo[0];
         /* WARNING: nodeinfo must be the last member here */
  };
@@ -490,11 +536,6 @@ struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
         return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
  }
  
-struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css)
-{
-       return &mem_cgroup_from_css(css)->vmpressure;
-}
-
  static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
  {
         return (memcg == root_mem_cgroup);
@@ -5648,13 +5689,11 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
                 mem_cgroup_oom_notify_cb(iter);
  }
  
-static int mem_cgroup_usage_register_event(struct cgroup_subsys_state *css,
-       struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
+static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
+       struct eventfd_ctx *eventfd, const char *args, enum res_type type)
  {
-       struct mem_cgroup *memcg = mem_cgroup_from_css(css);
         struct mem_cgroup_thresholds *thresholds;
         struct mem_cgroup_threshold_ary *new;
-       enum res_type type = MEMFILE_TYPE(cft->private);
         u64 threshold, usage;
         int i, size, ret;
  
@@ -5731,13 +5770,23 @@ unlock:
         return ret;
  }
  
-static void mem_cgroup_usage_unregister_event(struct cgroup_subsys_state *css,
-       struct cftype *cft, struct eventfd_ctx *eventfd)
+static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
+       struct eventfd_ctx *eventfd, const char *args)
+{
+       return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
+}
+
+static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
+       struct eventfd_ctx *eventfd, const char *args)
+{
+       return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
+}
+
+static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
+       struct eventfd_ctx *eventfd, enum res_type type)
  {
-       struct mem_cgroup *memcg = mem_cgroup_from_css(css);
         struct mem_cgroup_thresholds *thresholds;
         struct mem_cgroup_threshold_ary *new;
-       enum res_type type = MEMFILE_TYPE(cft->private);
         u64 usage;
         int i, j, size;
  
@@ -5810,14 +5859,23 @@ unlock:
         mutex_unlock(&memcg->thresholds_lock);
  }
  
-static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css,
-       struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
+static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
+       struct eventfd_ctx *eventfd)
+{
+       return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
+}
+
+static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
+       struct eventfd_ctx *eventfd)
+{
+       return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
+}
+
+static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
+       struct eventfd_ctx *eventfd, const char *args)
  {
-       struct mem_cgroup *memcg = mem_cgroup_from_css(css);
         struct mem_cgroup_eventfd_list *event;
-       enum res_type type = MEMFILE_TYPE(cft->private);
  
-       BUG_ON(type != _OOM_TYPE);
         event = kmalloc(sizeof(*event), GFP_KERNEL);
         if (!event)
                 return -ENOMEM;
@@ -5835,14 +5893,10 @@ static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css,
         return 0;
  }
  
-static void mem_cgroup_oom_unregister_event(struct cgroup_subsys_state *css,
-       struct cftype *cft, struct eventfd_ctx *eventfd)
+static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
+       struct eventfd_ctx *eventfd)
  {
-       struct mem_cgroup *memcg = mem_cgroup_from_css(css);
         struct mem_cgroup_eventfd_list *ev, *tmp;
-       enum res_type type = MEMFILE_TYPE(cft->private);
-
-       BUG_ON(type != _OOM_TYPE);
  
         spin_lock(&memcg_oom_lock);
  
@@ -5959,13 +6013,233 @@ static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)
  }
  #endif
  
+/*
+ * DO NOT USE IN NEW FILES.
+ *
+ * "cgroup.event_control" implementation.
+ *
+ * This is way over-engineered.  It tries to support fully configurable
+ * events for each user.  Such level of flexibility is completely
+ * unnecessary especially in the light of the planned unified hierarchy.
+ *
+ * Please deprecate this and replace with something simpler if at all
+ * possible.
+ */
+
+/*
+ * Unregister event and free resources.
+ *
+ * Gets called from workqueue.
+ */
+static void memcg_event_remove(struct work_struct *work)
+{
+       struct mem_cgroup_event *event =
+               container_of(work, struct mem_cgroup_event, remove);
+       struct mem_cgroup *memcg = event->memcg;
+
+       remove_wait_queue(event->wqh, &event->wait);
+
+       event->unregister_event(memcg, event->eventfd);
+
+       /* Notify userspace the event is going away. */
+       eventfd_signal(event->eventfd, 1);
+
+       eventfd_ctx_put(event->eventfd);
+       kfree(event);
+       css_put(&memcg->css);
+}
+
+/*
+ * Gets called on POLLHUP on eventfd when user closes it.
+ *
+ * Called with wqh->lock held and interrupts disabled.
+ */
+static int memcg_event_wake(wait_queue_t *wait, unsigned mode,
+                           int sync, void *key)
+{
+       struct mem_cgroup_event *event =
+               container_of(wait, struct mem_cgroup_event, wait);
+       struct mem_cgroup *memcg = event->memcg;
+       unsigned long flags = (unsigned long)key;
+
+       if (flags & POLLHUP) {
+               /*
+                * If the event has been detached at cgroup removal, we
+                * can simply return knowing the other side will cleanup
+                * for us.
+                *
+                * We can't race against event freeing since the other
+                * side will require wqh->lock via remove_wait_queue(),
+                * which we hold.
+                */
+               spin_lock(&memcg->event_list_lock);
+               if (!list_empty(&event->list)) {
+                       list_del_init(&event->list);
+                       /*
+                        * We are in atomic context, but memcg_event_remove()
+                        * may sleep, so we have to call it in workqueue.
+                        */
+                       schedule_work(&event->remove);
+               }
+               spin_unlock(&memcg->event_list_lock);
+       }
+
+       return 0;
+}
+
+static void memcg_event_ptable_queue_proc(struct file *file,
+               wait_queue_head_t *wqh, poll_table *pt)
+{
+       struct mem_cgroup_event *event =
+               container_of(pt, struct mem_cgroup_event, pt);
+
+       event->wqh = wqh;
+       add_wait_queue(wqh, &event->wait);
+}
+
+/*
+ * DO NOT USE IN NEW FILES.
+ *
+ * Parse input and register new cgroup event handler.
+ *
+ * Input must be in format '<event_fd> <control_fd> <args>'.
+ * Interpretation of args is defined by control file implementation.
+ */
+static int memcg_write_event_control(struct cgroup_subsys_state *css,
+                                    struct cftype *cft, const char *buffer)
+{
+       struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+       struct mem_cgroup_event *event;
+       struct cgroup_subsys_state *cfile_css;
+       unsigned int efd, cfd;
+       struct fd efile;
+       struct fd cfile;
+       const char *name;
+       char *endp;
+       int ret;
+
+       efd = simple_strtoul(buffer, &endp, 10);
+       if (*endp != ' ')
+               return -EINVAL;
+       buffer = endp + 1;
+
+       cfd = simple_strtoul(buffer, &endp, 10);
+       if ((*endp != ' ') && (*endp != '\0'))
+               return -EINVAL;
+       buffer = endp + 1;
+
+       event = kzalloc(sizeof(*event), GFP_KERNEL);
+       if (!event)
+               return -ENOMEM;
+
+       event->memcg = memcg;
+       INIT_LIST_HEAD(&event->list);
+       init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
+       init_waitqueue_func_entry(&event->wait, memcg_event_wake);
+       INIT_WORK(&event->remove, memcg_event_remove);
+
+       efile = fdget(efd);
+       if (!efile.file) {
+               ret = -EBADF;
+               goto out_kfree;
+       }
+
+       event->eventfd = eventfd_ctx_fileget(efile.file);
+       if (IS_ERR(event->eventfd)) {
+               ret = PTR_ERR(event->eventfd);
+               goto out_put_efile;
+       }
+
+       cfile = fdget(cfd);
+       if (!cfile.file) {
+               ret = -EBADF;
+               goto out_put_eventfd;
+       }
+
+       /* the process needs read permission on the control file */
+       /* AV: shouldn't we check that it's been opened for read instead? */
+       ret = inode_permission(file_inode(cfile.file), MAY_READ);
+       if (ret < 0)
+               goto out_put_cfile;
+
+       /*
+        * Determine the event callbacks and set them in @event.  This used
+        * to be done via struct cftype but cgroup core no longer knows
+        * about these events.  The following is crude but the whole thing
+        * is for compatibility anyway.
+        *
+        * DO NOT ADD NEW FILES.
+        */
+       name = cfile.file->f_dentry->d_name.name;
+
+       if (!strcmp(name, "memory.usage_in_bytes")) {
+               event->register_event = mem_cgroup_usage_register_event;
+               event->unregister_event = mem_cgroup_usage_unregister_event;
+       } else if (!strcmp(name, "memory.oom_control")) {
+               event->register_event = mem_cgroup_oom_register_event;
+               event->unregister_event = mem_cgroup_oom_unregister_event;
+       } else if (!strcmp(name, "memory.pressure_level")) {
+               event->register_event = vmpressure_register_event;
+               event->unregister_event = vmpressure_unregister_event;
+       } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
+               event->register_event = memsw_cgroup_usage_register_event;
+               event->unregister_event = memsw_cgroup_usage_unregister_event;
+       } else {
+               ret = -EINVAL;
+               goto out_put_cfile;
+       }
+
+       /*
+        * Verify that @cfile belongs to @css.  Also, remaining events are
+        * automatically removed on cgroup destruction but the removal is
+        * asynchronous, so take an extra ref on @css.
+        */
+       rcu_read_lock();
+
+       ret = -EINVAL;
+       cfile_css = css_from_dir(cfile.file->f_dentry->d_parent,
+                                &mem_cgroup_subsys);
+       if (cfile_css == css && css_tryget(css))
+               ret = 0;
+
+       rcu_read_unlock();
+       if (ret)
+               goto out_put_cfile;
+
+       ret = event->register_event(memcg, event->eventfd, buffer);
+       if (ret)
+               goto out_put_css;
+
+       efile.file->f_op->poll(efile.file, &event->pt);
+
+       spin_lock(&memcg->event_list_lock);
+       list_add(&event->list, &memcg->event_list);
+       spin_unlock(&memcg->event_list_lock);
+
+       fdput(cfile);
+       fdput(efile);
+
+       return 0;
+
+out_put_css:
+       css_put(css);
+out_put_cfile:
+       fdput(cfile);
+out_put_eventfd:
+       eventfd_ctx_put(event->eventfd);
+out_put_efile:
+       fdput(efile);
+out_kfree:
+       kfree(event);
+
+       return ret;
+}
+
  static struct cftype mem_cgroup_files[] = {
         {
                 .name = "usage_in_bytes",
                 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
                 .read = mem_cgroup_read,
-               .register_event = mem_cgroup_usage_register_event,
-               .unregister_event = mem_cgroup_usage_unregister_event,
         },
         {
                 .name = "max_usage_in_bytes",
@@ -6005,6 +6279,12 @@ static struct cftype mem_cgroup_files[] = {
                 .write_u64 = mem_cgroup_hierarchy_write,
                 .read_u64 = mem_cgroup_hierarchy_read,
         },
+       {
+               .name = "cgroup.event_control",         /* XXX: for compat */
+               .write_string = memcg_write_event_control,
+               .flags = CFTYPE_NO_PREFIX,
+               .mode = S_IWUGO,
+       },
         {
                 .name = "swappiness",
                 .read_u64 = mem_cgroup_swappiness_read,
@@ -6019,14 +6299,10 @@ static struct cftype mem_cgroup_files[] = {
                 .name = "oom_control",
                 .read_map = mem_cgroup_oom_control_read,
                 .write_u64 = mem_cgroup_oom_control_write,
-               .register_event = mem_cgroup_oom_register_event,
-               .unregister_event = mem_cgroup_oom_unregister_event,
                 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
         },
         {
                 .name = "pressure_level",
-               .register_event = vmpressure_register_event,
-               .unregister_event = vmpressure_unregister_event,
         },
  #ifdef CONFIG_NUMA
         {
@@ -6074,8 +6350,6 @@ static struct cftype memsw_cgroup_files[] = {
                 .name = "memsw.usage_in_bytes",
                 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
                 .read = mem_cgroup_read,
-               .register_event = mem_cgroup_usage_register_event,
-               .unregister_event = mem_cgroup_usage_unregister_event,
         },
         {
                 .name = "memsw.max_usage_in_bytes",
@@ -6265,6 +6539,8 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
         mutex_init(&memcg->thresholds_lock);
         spin_lock_init(&memcg->move_lock);
         vmpressure_init(&memcg->vmpressure);
+       INIT_LIST_HEAD(&memcg->event_list);
+       spin_lock_init(&memcg->event_list_lock);
  
         return &memcg->css;
  
@@ -6340,6 +6616,19 @@ static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg)
  static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
  {
         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+       struct mem_cgroup_event *event, *tmp;
+
+       /*
+        * Unregister events and notify userspace.
+        * Notify userspace about cgroup removal only after rmdir of the cgroup
+        * directory, to avoid a race between userspace and kernelspace.
+        */
+       spin_lock(&memcg->event_list_lock);
+       list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
+               list_del_init(&event->list);
+               schedule_work(&event->remove);
+       }
+       spin_unlock(&memcg->event_list_lock);
  
         kmem_cgroup_css_offline(memcg);
  
diff --git a/mm/vmpressure.c b/mm/vmpressure.c

index e0f62837c3f4873ea2d54c14845c30cb888a386b..196970a4541f0c07108eff49b7cb8d12fd930b06 100644 (file)
--- a/mm/vmpressure.c
+++ b/mm/vmpressure.c
@@ -278,8 +278,7 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
  
  /**
   * vmpressure_register_event() - Bind vmpressure notifications to an eventfd
- * @css:       css that is interested in vmpressure notifications
- * @cft:       cgroup control files handle
+ * @memcg:     memcg that is interested in vmpressure notifications
   * @eventfd:   eventfd context to link notifications with
   * @args:      event arguments (used to set up a pressure level threshold)
   *
@@ -289,15 +288,12 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
   * threshold (one of vmpressure_str_levels, i.e. "low", "medium", or
   * "critical").
   *
- * This function should not be used directly, just pass it to (struct
- * cftype).register_event, and then cgroup core will handle everything by
- * itself.
+ * To be used as a memcg event method.
   */
-int vmpressure_register_event(struct cgroup_subsys_state *css,
-                             struct cftype *cft, struct eventfd_ctx *eventfd,
-                             const char *args)
+int vmpressure_register_event(struct mem_cgroup *memcg,
+                             struct eventfd_ctx *eventfd, const char *args)
  {
-       struct vmpressure *vmpr = css_to_vmpressure(css);
+       struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
         struct vmpressure_event *ev;
         int level;
  
@@ -325,23 +321,19 @@ int vmpressure_register_event(struct cgroup_subsys_state *css,
  
  /**
   * vmpressure_unregister_event() - Unbind eventfd from vmpressure
- * @css:       css handle
- * @cft:       cgroup control files handle
+ * @memcg:     memcg handle
   * @eventfd:   eventfd context that was used to link vmpressure with the @cg
   *
   * This function does internal manipulations to detach the @eventfd from
   * the vmpressure notifications, and then frees internal resources
   * associated with the @eventfd (but the @eventfd itself is not freed).
   *
- * This function should not be used directly, just pass it to (struct
- * cftype).unregister_event, and then cgroup core will handle everything
- * by itself.
+ * To be used as a memcg event method.
   */
-void vmpressure_unregister_event(struct cgroup_subsys_state *css,
-                                struct cftype *cft,
+void vmpressure_unregister_event(struct mem_cgroup *memcg,
                                  struct eventfd_ctx *eventfd)
  {
-       struct vmpressure *vmpr = css_to_vmpressure(css);
+       struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
         struct vmpressure_event *ev;
  
         mutex_lock(&vmpr->events_lock);
author	Tejun Heo <tj@kernel.org>
	Fri, 22 Nov 2013 23:32:25 +0000 (18:32 -0500)
committer	Tejun Heo <tj@kernel.org>
	Fri, 22 Nov 2013 23:32:25 +0000 (18:32 -0500)
Documentation/cgroups/cgroups.txt		patch \| blob \| history
include/linux/cgroup.h		patch \| blob \| history
include/linux/vmpressure.h		patch \| blob \| history
init/Kconfig		patch \| blob \| history
kernel/cgroup.c		patch \| blob \| history
mm/memcontrol.c		patch \| blob \| history
mm/vmpressure.c		patch \| blob \| history