cgroup, memcg: move cgroup->event_list[_lock] and event callbacks into memcg

[~andy/linux] / kernel / cgroup.c
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 2418b6e71a854e187573ec12746481ce863e721a..feda7c54fa6b1f74ad7aeae56bf41f998a51f72c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -56,11 +56,8 @@
  #include <linux/pid_namespace.h>
  #include <linux/idr.h>
  #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
-#include <linux/eventfd.h>
-#include <linux/poll.h>
  #include <linux/flex_array.h> /* used in cgroup_attach_task */
  #include <linux/kthread.h>
-#include <linux/file.h>
  
  #include <linux/atomic.h>
  
@@ -156,36 +153,6 @@ struct css_id {
         unsigned short stack[0]; /* Array of Length (depth+1) */
  };
  
-/*
- * cgroup_event represents events which userspace want to receive.
- */
-struct cgroup_event {
-       /*
-        * css which the event belongs to.
-        */
-       struct cgroup_subsys_state *css;
-       /*
-        * Control file which the event associated.
-        */
-       struct cftype *cft;
-       /*
-        * eventfd to signal userspace about the event.
-        */
-       struct eventfd_ctx *eventfd;
-       /*
-        * Each of these stored in a list by the cgroup.
-        */
-       struct list_head list;
-       /*
-        * All fields below needed to unregister event when
-        * userspace closes eventfd.
-        */
-       poll_table pt;
-       wait_queue_head_t *wqh;
-       wait_queue_t wait;
-       struct work_struct remove;
-};
-
  /* The list of hierarchy roots */
  
  static LIST_HEAD(cgroup_roots);
@@ -235,8 +202,8 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
   * keep accessing it outside the said locks.  This function may return
   * %NULL if @cgrp doesn't have @subsys_id enabled.
   */
-static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
-                                             struct cgroup_subsys *ss)
+struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
+                                      struct cgroup_subsys *ss)
  {
         if (ss)
                 return rcu_dereference_check(cgrp->subsys[ss->subsys_id],
@@ -1385,8 +1352,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
         INIT_LIST_HEAD(&cgrp->pidlists);
         mutex_init(&cgrp->pidlist_mutex);
         cgrp->dummy_css.cgroup = cgrp;
-       INIT_LIST_HEAD(&cgrp->event_list);
-       spin_lock_init(&cgrp->event_list_lock);
         simple_xattrs_init(&cgrp->xattrs);
  }
  
@@ -2039,7 +2004,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
  
                 /* @tsk either already exited or can't exit until the end */
                 if (tsk->flags & PF_EXITING)
-                       continue;
+                       goto next;
  
                 /* as per above, nr_threads may decrease, but not increase. */
                 BUG_ON(i >= group_size);
@@ -2047,7 +2012,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
                 ent.cgrp = task_cgroup_from_root(tsk, root);
                 /* nothing to do if this task is already in the cgroup */
                 if (ent.cgrp == cgrp)
-                       continue;
+                       goto next;
                 /*
                  * saying GFP_ATOMIC has no effect here because we did prealloc
                  * earlier, but it's good form to communicate our expectations.
@@ -2055,7 +2020,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
                 retval = flex_array_put(group, i, &ent, GFP_ATOMIC);
                 BUG_ON(retval != 0);
                 i++;
-
+       next:
                 if (!threadgroup)
                         break;
         } while_each_thread(leader, tsk);
@@ -2663,7 +2628,7 @@ static const struct inode_operations cgroup_dir_inode_operations = {
  /*
   * Check if a file is a control file
   */
-static inline struct cftype *__file_cft(struct file *file)
+struct cftype *__file_cft(struct file *file)
  {
         if (file_inode(file)->i_fop != &cgroup_file_operations)
                 return ERR_PTR(-EINVAL);
@@ -3188,11 +3153,9 @@ css_next_descendant_post(struct cgroup_subsys_state *pos,
  
         WARN_ON_ONCE(!rcu_read_lock_held());
  
-       /* if first iteration, visit the leftmost descendant */
-       if (!pos) {
-               next = css_leftmost_descendant(root);
-               return next != root ? next : NULL;
-       }
+       /* if first iteration, visit leftmost descendant which may be @root */
+       if (!pos)
+               return css_leftmost_descendant(root);
  
         /* if we visited @root, we're done */
         if (pos == root)
@@ -3951,202 +3914,6 @@ static void cgroup_dput(struct cgroup *cgrp)
         deactivate_super(sb);
  }
  
-/*
- * Unregister event and free resources.
- *
- * Gets called from workqueue.
- */
-static void cgroup_event_remove(struct work_struct *work)
-{
-       struct cgroup_event *event = container_of(work, struct cgroup_event,
-                       remove);
-       struct cgroup_subsys_state *css = event->css;
-
-       remove_wait_queue(event->wqh, &event->wait);
-
-       event->cft->unregister_event(css, event->cft, event->eventfd);
-
-       /* Notify userspace the event is going away. */
-       eventfd_signal(event->eventfd, 1);
-
-       eventfd_ctx_put(event->eventfd);
-       kfree(event);
-       css_put(css);
-}
-
-/*
- * Gets called on POLLHUP on eventfd when user closes it.
- *
- * Called with wqh->lock held and interrupts disabled.
- */
-static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
-               int sync, void *key)
-{
-       struct cgroup_event *event = container_of(wait,
-                       struct cgroup_event, wait);
-       struct cgroup *cgrp = event->css->cgroup;
-       unsigned long flags = (unsigned long)key;
-
-       if (flags & POLLHUP) {
-               /*
-                * If the event has been detached at cgroup removal, we
-                * can simply return knowing the other side will cleanup
-                * for us.
-                *
-                * We can't race against event freeing since the other
-                * side will require wqh->lock via remove_wait_queue(),
-                * which we hold.
-                */
-               spin_lock(&cgrp->event_list_lock);
-               if (!list_empty(&event->list)) {
-                       list_del_init(&event->list);
-                       /*
-                        * We are in atomic context, but cgroup_event_remove()
-                        * may sleep, so we have to call it in workqueue.
-                        */
-                       schedule_work(&event->remove);
-               }
-               spin_unlock(&cgrp->event_list_lock);
-       }
-
-       return 0;
-}
-
-static void cgroup_event_ptable_queue_proc(struct file *file,
-               wait_queue_head_t *wqh, poll_table *pt)
-{
-       struct cgroup_event *event = container_of(pt,
-                       struct cgroup_event, pt);
-
-       event->wqh = wqh;
-       add_wait_queue(wqh, &event->wait);
-}
-
-/*
- * Parse input and register new cgroup event handler.
- *
- * Input must be in format '<event_fd> <control_fd> <args>'.
- * Interpretation of args is defined by control file implementation.
- */
-static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css,
-                                     struct cftype *cft, const char *buffer)
-{
-       struct cgroup *cgrp = dummy_css->cgroup;
-       struct cgroup_event *event;
-       struct cgroup_subsys_state *cfile_css;
-       unsigned int efd, cfd;
-       struct fd efile;
-       struct fd cfile;
-       char *endp;
-       int ret;
-
-       efd = simple_strtoul(buffer, &endp, 10);
-       if (*endp != ' ')
-               return -EINVAL;
-       buffer = endp + 1;
-
-       cfd = simple_strtoul(buffer, &endp, 10);
-       if ((*endp != ' ') && (*endp != '\0'))
-               return -EINVAL;
-       buffer = endp + 1;
-
-       event = kzalloc(sizeof(*event), GFP_KERNEL);
-       if (!event)
-               return -ENOMEM;
-
-       INIT_LIST_HEAD(&event->list);
-       init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc);
-       init_waitqueue_func_entry(&event->wait, cgroup_event_wake);
-       INIT_WORK(&event->remove, cgroup_event_remove);
-
-       efile = fdget(efd);
-       if (!efile.file) {
-               ret = -EBADF;
-               goto out_kfree;
-       }
-
-       event->eventfd = eventfd_ctx_fileget(efile.file);
-       if (IS_ERR(event->eventfd)) {
-               ret = PTR_ERR(event->eventfd);
-               goto out_put_efile;
-       }
-
-       cfile = fdget(cfd);
-       if (!cfile.file) {
-               ret = -EBADF;
-               goto out_put_eventfd;
-       }
-
-       /* the process need read permission on control file */
-       /* AV: shouldn't we check that it's been opened for read instead? */
-       ret = inode_permission(file_inode(cfile.file), MAY_READ);
-       if (ret < 0)
-               goto out_put_cfile;
-
-       event->cft = __file_cft(cfile.file);
-       if (IS_ERR(event->cft)) {
-               ret = PTR_ERR(event->cft);
-               goto out_put_cfile;
-       }
-
-       if (!event->cft->ss) {
-               ret = -EBADF;
-               goto out_put_cfile;
-       }
-
-       /*
-        * Determine the css of @cfile, verify it belongs to the same
-        * cgroup as cgroup.event_control, and associate @event with it.
-        * Remaining events are automatically removed on cgroup destruction
-        * but the removal is asynchronous, so take an extra ref.
-        */
-       rcu_read_lock();
-
-       ret = -EINVAL;
-       event->css = cgroup_css(cgrp, event->cft->ss);
-       cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, event->cft->ss);
-       if (event->css && event->css == cfile_css && css_tryget(event->css))
-               ret = 0;
-
-       rcu_read_unlock();
-       if (ret)
-               goto out_put_cfile;
-
-       if (!event->cft->register_event || !event->cft->unregister_event) {
-               ret = -EINVAL;
-               goto out_put_css;
-       }
-
-       ret = event->cft->register_event(event->css, event->cft,
-                       event->eventfd, buffer);
-       if (ret)
-               goto out_put_css;
-
-       efile.file->f_op->poll(efile.file, &event->pt);
-
-       spin_lock(&cgrp->event_list_lock);
-       list_add(&event->list, &cgrp->event_list);
-       spin_unlock(&cgrp->event_list_lock);
-
-       fdput(cfile);
-       fdput(efile);
-
-       return 0;
-
-out_put_css:
-       css_put(event->css);
-out_put_cfile:
-       fdput(cfile);
-out_put_eventfd:
-       eventfd_ctx_put(event->eventfd);
-out_put_efile:
-       fdput(efile);
-out_kfree:
-       kfree(event);
-
-       return ret;
-}
-
  static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
                                       struct cftype *cft)
  {
@@ -4171,11 +3938,6 @@ static struct cftype cgroup_base_files[] = {
                 .release = cgroup_pidlist_release,
                 .mode = S_IRUGO | S_IWUSR,
         },
-       {
-               .name = "cgroup.event_control",
-               .write_string = cgroup_write_event_control,
-               .mode = S_IWUGO,
-       },
         {
                 .name = "cgroup.clone_children",
                 .flags = CFTYPE_INSANE,
@@ -4668,7 +4430,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
         __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
  {
         struct dentry *d = cgrp->dentry;
-       struct cgroup_event *event, *tmp;
         struct cgroup_subsys *ss;
         struct cgroup *child;
         bool empty;
@@ -4743,18 +4504,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
         dget(d);
         cgroup_d_remove_dir(d);
  
-       /*
-        * Unregister events and notify userspace.
-        * Notify userspace about cgroup removing only after rmdir of cgroup
-        * directory to avoid race between userspace and kernelspace.
-        */
-       spin_lock(&cgrp->event_list_lock);
-       list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
-               list_del_init(&event->list);
-               schedule_work(&event->remove);
-       }
-       spin_unlock(&cgrp->event_list_lock);
-
         return 0;
  };