perfcounters: fix a few minor cleanliness issues
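
The diff below adds, among other things, counter enable/disable ioctls (PERF_COUNTER_IOC_ENABLE/PERF_COUNTER_IOC_DISABLE), pinned and exclusive counter groups, a counter error state, and group-aware inheritance. As a rough illustration only, the sketch that follows shows how the new ioctls and the perf_counter_open syscall might be exercised from userspace; the perf_counter_open() wrapper is hypothetical, and it assumes this tree's <linux/perf_counter.h> is on the include path and that __NR_perf_counter_open is provided by the arch's unistd.h.

    /*
     * Minimal userspace sketch (not part of the patch): create a disabled
     * task-clock counter, then turn it on and off via the new ioctls.
     */
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <sys/syscall.h>
    #include <linux/perf_counter.h>

    static int perf_counter_open(struct perf_counter_hw_event *hw_event,
                                 pid_t pid, int cpu, int group_fd)
    {
            /* No libc wrapper exists for the new syscall; call it raw. */
            return syscall(__NR_perf_counter_open, hw_event, pid, cpu, group_fd);
    }

    int main(void)
    {
            struct perf_counter_hw_event hw_event;
            unsigned long long count;
            int fd;

            memset(&hw_event, 0, sizeof(hw_event));
            hw_event.type     = PERF_COUNT_TASK_CLOCK; /* software counter */
            hw_event.disabled = 1;                     /* start in OFF state */

            /* pid 0 = current task, cpu -1 = any cpu, no group leader */
            fd = perf_counter_open(&hw_event, 0, -1, -1);
            if (fd < 0)
                    return 1;

            ioctl(fd, PERF_COUNTER_IOC_ENABLE);   /* -> perf_counter_enable_family() */
            /* ... workload to be measured ... */
            ioctl(fd, PERF_COUNTER_IOC_DISABLE);  /* -> perf_counter_disable_family() */

            /* read() returns 0 (EOF) if the counter ended up in ERROR state */
            if (read(fd, &count, sizeof(count)) == sizeof(count))
                    printf("task clock: %llu ns\n", count);

            close(fd);
            return 0;
    }
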
index 961d651aa5748903cbfad505c2f8aa9e1eb89f3a..16b14ba99d34d839604ef3bef76f70f2e8854e0b 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
 #include <linux/uaccess.h>
 #include <linux/syscalls.h>
 #include <linux/anon_inodes.h>
+#include <linux/kernel_stat.h>
 #include <linux/perf_counter.h>
+#include <linux/mm.h>
+#include <linux/vmstat.h>
 
 /*
  * Each CPU has a list of per CPU counters:
@@ -40,12 +43,20 @@ static DEFINE_MUTEX(perf_resource_mutex);
 extern __weak const struct hw_perf_counter_ops *
 hw_perf_counter_init(struct perf_counter *counter)
 {
-       return ERR_PTR(-EINVAL);
+       return NULL;
 }
 
 u64 __weak hw_perf_save_disable(void)          { return 0; }
-void __weak hw_perf_restore(u64 ctrl)          { }
-void __weak hw_perf_counter_setup(void)                { }
+void __weak hw_perf_restore(u64 ctrl)          { barrier(); }
+void __weak hw_perf_counter_setup(int cpu)     { barrier(); }
+int __weak hw_perf_group_sched_in(struct perf_counter *group_leader,
+              struct perf_cpu_context *cpuctx,
+              struct perf_counter_context *ctx, int cpu)
+{
+       return 0;
+}
+
+void __weak perf_counter_print_debug(void)     { }
 
 static void
 list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
@@ -84,6 +95,47 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
        }
 }
 
+static void
+counter_sched_out(struct perf_counter *counter,
+                 struct perf_cpu_context *cpuctx,
+                 struct perf_counter_context *ctx)
+{
+       if (counter->state != PERF_COUNTER_STATE_ACTIVE)
+               return;
+
+       counter->state = PERF_COUNTER_STATE_INACTIVE;
+       counter->hw_ops->disable(counter);
+       counter->oncpu = -1;
+
+       if (!is_software_counter(counter))
+               cpuctx->active_oncpu--;
+       ctx->nr_active--;
+       if (counter->hw_event.exclusive || !cpuctx->active_oncpu)
+               cpuctx->exclusive = 0;
+}
+
+static void
+group_sched_out(struct perf_counter *group_counter,
+               struct perf_cpu_context *cpuctx,
+               struct perf_counter_context *ctx)
+{
+       struct perf_counter *counter;
+
+       if (group_counter->state != PERF_COUNTER_STATE_ACTIVE)
+               return;
+
+       counter_sched_out(group_counter, cpuctx, ctx);
+
+       /*
+        * Schedule out siblings (if any):
+        */
+       list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
+               counter_sched_out(counter, cpuctx, ctx);
+
+       if (group_counter->hw_event.exclusive)
+               cpuctx->exclusive = 0;
+}
+
 /*
  * Cross CPU call to remove a performance counter
  *
@@ -106,15 +158,12 @@ static void __perf_counter_remove_from_context(void *info)
        if (ctx->task && cpuctx->task_ctx != ctx)
                return;
 
-       spin_lock_irqsave(&ctx->lock, flags);
+       curr_rq_lock_irq_save(&flags);
+       spin_lock(&ctx->lock);
 
-       if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
-               counter->hw_ops->disable(counter);
-               counter->state = PERF_COUNTER_STATE_INACTIVE;
-               ctx->nr_active--;
-               cpuctx->active_oncpu--;
-               counter->task = NULL;
-       }
+       counter_sched_out(counter, cpuctx, ctx);
+
+       counter->task = NULL;
        ctx->nr_counters--;
 
        /*
@@ -135,14 +184,15 @@ static void __perf_counter_remove_from_context(void *info)
                            perf_max_counters - perf_reserved_percpu);
        }
 
-       spin_unlock_irqrestore(&ctx->lock, flags);
+       spin_unlock(&ctx->lock);
+       curr_rq_unlock_irq_restore(&flags);
 }
 
 
 /*
  * Remove the counter from a task's (or a CPU's) list of counters.
  *
- * Must be called with counter->mutex held.
+ * Must be called with counter->mutex and ctx->mutex held.
  *
  * CPU counters are removed with a smp call. For task counters we only
  * call when the task is on a CPU.
@@ -190,16 +240,190 @@ retry:
 }
 
 /*
- * Cross CPU call to install and enable a preformance counter
+ * Cross CPU call to disable a performance counter
+ */
+static void __perf_counter_disable(void *info)
+{
+       struct perf_counter *counter = info;
+       struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+       struct perf_counter_context *ctx = counter->ctx;
+       unsigned long flags;
+
+       /*
+        * If this is a per-task counter, need to check whether this
+        * counter's task is the current task on this cpu.
+        */
+       if (ctx->task && cpuctx->task_ctx != ctx)
+               return;
+
+       curr_rq_lock_irq_save(&flags);
+       spin_lock(&ctx->lock);
+
+       /*
+        * If the counter is on, turn it off.
+        * If it is in error state, leave it in error state.
+        */
+       if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
+               if (counter == counter->group_leader)
+                       group_sched_out(counter, cpuctx, ctx);
+               else
+                       counter_sched_out(counter, cpuctx, ctx);
+               counter->state = PERF_COUNTER_STATE_OFF;
+       }
+
+       spin_unlock(&ctx->lock);
+       curr_rq_unlock_irq_restore(&flags);
+}
+
+/*
+ * Disable a counter.
+ */
+static void perf_counter_disable(struct perf_counter *counter)
+{
+       struct perf_counter_context *ctx = counter->ctx;
+       struct task_struct *task = ctx->task;
+
+       if (!task) {
+               /*
+                * Disable the counter on the cpu that it's on
+                */
+               smp_call_function_single(counter->cpu, __perf_counter_disable,
+                                        counter, 1);
+               return;
+       }
+
+ retry:
+       task_oncpu_function_call(task, __perf_counter_disable, counter);
+
+       spin_lock_irq(&ctx->lock);
+       /*
+        * If the counter is still active, we need to retry the cross-call.
+        */
+       if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
+               spin_unlock_irq(&ctx->lock);
+               goto retry;
+       }
+
+       /*
+        * Since we have the lock this context can't be scheduled
+        * in, so we can change the state safely.
+        */
+       if (counter->state == PERF_COUNTER_STATE_INACTIVE)
+               counter->state = PERF_COUNTER_STATE_OFF;
+
+       spin_unlock_irq(&ctx->lock);
+}
+
+/*
+ * Disable a counter and all its children.
+ */
+static void perf_counter_disable_family(struct perf_counter *counter)
+{
+       struct perf_counter *child;
+
+       perf_counter_disable(counter);
+
+       /*
+        * Lock the mutex to protect the list of children
+        */
+       mutex_lock(&counter->mutex);
+       list_for_each_entry(child, &counter->child_list, child_list)
+               perf_counter_disable(child);
+       mutex_unlock(&counter->mutex);
+}
+
+static int
+counter_sched_in(struct perf_counter *counter,
+                struct perf_cpu_context *cpuctx,
+                struct perf_counter_context *ctx,
+                int cpu)
+{
+       if (counter->state <= PERF_COUNTER_STATE_OFF)
+               return 0;
+
+       counter->state = PERF_COUNTER_STATE_ACTIVE;
+       counter->oncpu = cpu;   /* TODO: put 'cpu' into cpuctx->cpu */
+       /*
+        * The new state must be visible before we turn it on in the hardware:
+        */
+       smp_wmb();
+
+       if (counter->hw_ops->enable(counter)) {
+               counter->state = PERF_COUNTER_STATE_INACTIVE;
+               counter->oncpu = -1;
+               return -EAGAIN;
+       }
+
+       if (!is_software_counter(counter))
+               cpuctx->active_oncpu++;
+       ctx->nr_active++;
+
+       if (counter->hw_event.exclusive)
+               cpuctx->exclusive = 1;
+
+       return 0;
+}
+
+/*
+ * Return 1 for a group consisting entirely of software counters,
+ * 0 if the group contains any hardware counters.
+ */
+static int is_software_only_group(struct perf_counter *leader)
+{
+       struct perf_counter *counter;
+
+       if (!is_software_counter(leader))
+               return 0;
+       list_for_each_entry(counter, &leader->sibling_list, list_entry)
+               if (!is_software_counter(counter))
+                       return 0;
+       return 1;
+}
+
+/*
+ * Work out whether we can put this counter group on the CPU now.
+ */
+static int group_can_go_on(struct perf_counter *counter,
+                          struct perf_cpu_context *cpuctx,
+                          int can_add_hw)
+{
+       /*
+        * Groups consisting entirely of software counters can always go on.
+        */
+       if (is_software_only_group(counter))
+               return 1;
+       /*
+        * If an exclusive group is already on, no other hardware
+        * counters can go on.
+        */
+       if (cpuctx->exclusive)
+               return 0;
+       /*
+        * If this group is exclusive and there are already
+        * counters on the CPU, it can't go on.
+        */
+       if (counter->hw_event.exclusive && cpuctx->active_oncpu)
+               return 0;
+       /*
+        * Otherwise, try to add it if all previous groups were able
+        * to go on.
+        */
+       return can_add_hw;
+}
+
+/*
+ * Cross CPU call to install and enable a performance counter
  */
 static void __perf_install_in_context(void *info)
 {
        struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
        struct perf_counter *counter = info;
        struct perf_counter_context *ctx = counter->ctx;
+       struct perf_counter *leader = counter->group_leader;
        int cpu = smp_processor_id();
        unsigned long flags;
        u64 perf_flags;
+       int err;
 
        /*
         * If this is a task context, we need to check whether it is
@@ -209,30 +433,57 @@ static void __perf_install_in_context(void *info)
        if (ctx->task && cpuctx->task_ctx != ctx)
                return;
 
-       spin_lock_irqsave(&ctx->lock, flags);
+       curr_rq_lock_irq_save(&flags);
+       spin_lock(&ctx->lock);
 
        /*
         * Protect the list operation against NMI by disabling the
         * counters on a global level. NOP for non NMI based counters.
         */
        perf_flags = hw_perf_save_disable();
-       list_add_counter(counter, ctx);
-       hw_perf_restore(perf_flags);
 
+       list_add_counter(counter, ctx);
        ctx->nr_counters++;
+       counter->prev_state = PERF_COUNTER_STATE_OFF;
 
-       if (cpuctx->active_oncpu < perf_max_counters) {
-               counter->state = PERF_COUNTER_STATE_ACTIVE;
-               counter->oncpu = cpu;
-               ctx->nr_active++;
-               cpuctx->active_oncpu++;
-               counter->hw_ops->enable(counter);
+       /*
+        * Don't put the counter on if it is disabled or if
+        * it is in a group and the group isn't on.
+        */
+       if (counter->state != PERF_COUNTER_STATE_INACTIVE ||
+           (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE))
+               goto unlock;
+
+       /*
+        * An exclusive counter can't go on if there are already active
+        * hardware counters, and no hardware counter can go on if there
+        * is already an exclusive counter on.
+        */
+       if (!group_can_go_on(counter, cpuctx, 1))
+               err = -EEXIST;
+       else
+               err = counter_sched_in(counter, cpuctx, ctx, cpu);
+
+       if (err) {
+               /*
+                * This counter couldn't go on.  If it is in a group
+                * then we have to pull the whole group off.
+                * If the counter group is pinned then put it in error state.
+                */
+               if (leader != counter)
+                       group_sched_out(leader, cpuctx, ctx);
+               if (leader->hw_event.pinned)
+                       leader->state = PERF_COUNTER_STATE_ERROR;
        }
 
-       if (!ctx->task && cpuctx->max_pertask)
+       if (!err && !ctx->task && cpuctx->max_pertask)
                cpuctx->max_pertask--;
 
-       spin_unlock_irqrestore(&ctx->lock, flags);
+ unlock:
+       hw_perf_restore(perf_flags);
+
+       spin_unlock(&ctx->lock);
+       curr_rq_unlock_irq_restore(&flags);
 }
 
 /*
@@ -244,6 +495,8 @@ static void __perf_install_in_context(void *info)
  * If the counter is attached to a task which is on a CPU we use a smp
  * call to enable it in the task context. The task might have been
  * scheduled away, but we check this in the smp call again.
+ *
+ * Must be called with ctx->mutex held.
  */
 static void
 perf_install_in_context(struct perf_counter_context *ctx,
@@ -252,7 +505,6 @@ perf_install_in_context(struct perf_counter_context *ctx,
 {
        struct task_struct *task = ctx->task;
 
-       counter->ctx = ctx;
        if (!task) {
                /*
                 * Per cpu counters are installed via an smp call and
@@ -272,7 +524,7 @@ retry:
        /*
         * we need to retry the smp call.
         */
-       if (ctx->nr_active && list_empty(&counter->list_entry)) {
+       if (ctx->is_active && list_empty(&counter->list_entry)) {
                spin_unlock_irq(&ctx->lock);
                goto retry;
        }
@@ -289,36 +541,153 @@ retry:
        spin_unlock_irq(&ctx->lock);
 }
 
-static void
-counter_sched_out(struct perf_counter *counter,
-                 struct perf_cpu_context *cpuctx,
-                 struct perf_counter_context *ctx)
+/*
+ * Cross CPU call to enable a performance counter
+ */
+static void __perf_counter_enable(void *info)
 {
-       if (counter->state != PERF_COUNTER_STATE_ACTIVE)
+       struct perf_counter *counter = info;
+       struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+       struct perf_counter_context *ctx = counter->ctx;
+       struct perf_counter *leader = counter->group_leader;
+       unsigned long flags;
+       int err;
+
+       /*
+        * If this is a per-task counter, need to check whether this
+        * counter's task is the current task on this cpu.
+        */
+       if (ctx->task && cpuctx->task_ctx != ctx)
                return;
 
-       counter->hw_ops->disable(counter);
+       curr_rq_lock_irq_save(&flags);
+       spin_lock(&ctx->lock);
+
+       counter->prev_state = counter->state;
+       if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
+               goto unlock;
        counter->state = PERF_COUNTER_STATE_INACTIVE;
-       counter->oncpu = -1;
 
-       cpuctx->active_oncpu--;
-       ctx->nr_active--;
+       /*
+        * If the counter is in a group and isn't the group leader,
+        * then don't put it on unless the group is on.
+        */
+       if (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE)
+               goto unlock;
+
+       if (!group_can_go_on(counter, cpuctx, 1))
+               err = -EEXIST;
+       else
+               err = counter_sched_in(counter, cpuctx, ctx,
+                                      smp_processor_id());
+
+       if (err) {
+               /*
+                * If this counter can't go on and it's part of a
+                * group, then the whole group has to come off.
+                */
+               if (leader != counter)
+                       group_sched_out(leader, cpuctx, ctx);
+               if (leader->hw_event.pinned)
+                       leader->state = PERF_COUNTER_STATE_ERROR;
+       }
+
+ unlock:
+       spin_unlock(&ctx->lock);
+       curr_rq_unlock_irq_restore(&flags);
 }
 
-static void
-group_sched_out(struct perf_counter *group_counter,
-               struct perf_cpu_context *cpuctx,
-               struct perf_counter_context *ctx)
+/*
+ * Enable a counter.
+ */
+static void perf_counter_enable(struct perf_counter *counter)
 {
-       struct perf_counter *counter;
+       struct perf_counter_context *ctx = counter->ctx;
+       struct task_struct *task = ctx->task;
 
-       counter_sched_out(group_counter, cpuctx, ctx);
+       if (!task) {
+               /*
+                * Enable the counter on the cpu that it's on
+                */
+               smp_call_function_single(counter->cpu, __perf_counter_enable,
+                                        counter, 1);
+               return;
+       }
+
+       spin_lock_irq(&ctx->lock);
+       if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
+               goto out;
 
        /*
-        * Schedule out siblings (if any):
+        * If the counter is in error state, clear that first.
+        * That way, if we see the counter in error state below, we
+        * know that it has gone back into error state, as distinct
+        * from the task having been scheduled away before the
+        * cross-call arrived.
         */
-       list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
-               counter_sched_out(counter, cpuctx, ctx);
+       if (counter->state == PERF_COUNTER_STATE_ERROR)
+               counter->state = PERF_COUNTER_STATE_OFF;
+
+ retry:
+       spin_unlock_irq(&ctx->lock);
+       task_oncpu_function_call(task, __perf_counter_enable, counter);
+
+       spin_lock_irq(&ctx->lock);
+
+       /*
+        * If the context is active and the counter is still off,
+        * we need to retry the cross-call.
+        */
+       if (ctx->is_active && counter->state == PERF_COUNTER_STATE_OFF)
+               goto retry;
+
+       /*
+        * Since we have the lock this context can't be scheduled
+        * in, so we can change the state safely.
+        */
+       if (counter->state == PERF_COUNTER_STATE_OFF)
+               counter->state = PERF_COUNTER_STATE_INACTIVE;
+ out:
+       spin_unlock_irq(&ctx->lock);
+}
+
+/*
+ * Enable a counter and all its children.
+ */
+static void perf_counter_enable_family(struct perf_counter *counter)
+{
+       struct perf_counter *child;
+
+       perf_counter_enable(counter);
+
+       /*
+        * Lock the mutex to protect the list of children
+        */
+       mutex_lock(&counter->mutex);
+       list_for_each_entry(child, &counter->child_list, child_list)
+               perf_counter_enable(child);
+       mutex_unlock(&counter->mutex);
+}
+
+void __perf_counter_sched_out(struct perf_counter_context *ctx,
+                             struct perf_cpu_context *cpuctx)
+{
+       struct perf_counter *counter;
+       u64 flags;
+
+       spin_lock(&ctx->lock);
+       ctx->is_active = 0;
+       if (likely(!ctx->nr_counters))
+               goto out;
+
+       flags = hw_perf_save_disable();
+       if (ctx->nr_active) {
+               list_for_each_entry(counter, &ctx->counter_list, list_entry)
+                       group_sched_out(counter, cpuctx, ctx);
+       }
+       hw_perf_restore(flags);
+ out:
+       spin_unlock(&ctx->lock);
 }
 
 /*
@@ -336,35 +705,18 @@ void perf_counter_task_sched_out(struct task_struct *task, int cpu)
 {
        struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
        struct perf_counter_context *ctx = &task->perf_counter_ctx;
-       struct perf_counter *counter;
 
        if (likely(!cpuctx->task_ctx))
                return;
 
-       spin_lock(&ctx->lock);
-       if (ctx->nr_active) {
-               list_for_each_entry(counter, &ctx->counter_list, list_entry)
-                       group_sched_out(counter, cpuctx, ctx);
-       }
-       spin_unlock(&ctx->lock);
-       cpuctx->task_ctx = NULL;
-}
-
-static void
-counter_sched_in(struct perf_counter *counter,
-                struct perf_cpu_context *cpuctx,
-                struct perf_counter_context *ctx,
-                int cpu)
-{
-       if (counter->state == PERF_COUNTER_STATE_OFF)
-               return;
-
-       counter->hw_ops->enable(counter);
-       counter->state = PERF_COUNTER_STATE_ACTIVE;
-       counter->oncpu = cpu;   /* TODO: put 'cpu' into cpuctx->cpu */
+       __perf_counter_sched_out(ctx, cpuctx);
 
-       cpuctx->active_oncpu++;
-       ctx->nr_active++;
+       cpuctx->task_ctx = NULL;
+}
+
+static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx)
+{
+       __perf_counter_sched_out(&cpuctx->ctx, cpuctx);
 }
 
 static int
@@ -373,46 +725,93 @@ group_sched_in(struct perf_counter *group_counter,
               struct perf_counter_context *ctx,
               int cpu)
 {
-       struct perf_counter *counter;
-       int was_group = 0;
+       struct perf_counter *counter, *partial_group;
+       int ret;
+
+       if (group_counter->state == PERF_COUNTER_STATE_OFF)
+               return 0;
+
+       ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu);
+       if (ret)
+               return ret < 0 ? ret : 0;
 
-       counter_sched_in(group_counter, cpuctx, ctx, cpu);
+       group_counter->prev_state = group_counter->state;
+       if (counter_sched_in(group_counter, cpuctx, ctx, cpu))
+               return -EAGAIN;
 
        /*
         * Schedule in siblings as one group (if any):
         */
        list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
-               counter_sched_in(counter, cpuctx, ctx, cpu);
-               was_group = 1;
+               counter->prev_state = counter->state;
+               if (counter_sched_in(counter, cpuctx, ctx, cpu)) {
+                       partial_group = counter;
+                       goto group_error;
+               }
+       }
+
+       return 0;
+
+group_error:
+       /*
+        * Groups can be scheduled in as one unit only, so undo any
+        * partial group before returning:
+        */
+       list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
+               if (counter == partial_group)
+                       break;
+               counter_sched_out(counter, cpuctx, ctx);
        }
+       counter_sched_out(group_counter, cpuctx, ctx);
 
-       return was_group;
+       return -EAGAIN;
 }
 
-/*
- * Called from scheduler to add the counters of the current task
- * with interrupts disabled.
- *
- * We restore the counter value and then enable it.
- *
- * This does not protect us against NMI, but enable()
- * sets the enabled bit in the control field of counter _before_
- * accessing the counter control register. If a NMI hits, then it will
- * keep the counter running.
- */
-void perf_counter_task_sched_in(struct task_struct *task, int cpu)
+static void
+__perf_counter_sched_in(struct perf_counter_context *ctx,
+                       struct perf_cpu_context *cpuctx, int cpu)
 {
-       struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
-       struct perf_counter_context *ctx = &task->perf_counter_ctx;
        struct perf_counter *counter;
+       u64 flags;
+       int can_add_hw = 1;
 
+       spin_lock(&ctx->lock);
+       ctx->is_active = 1;
        if (likely(!ctx->nr_counters))
-               return;
+               goto out;
 
-       spin_lock(&ctx->lock);
+       flags = hw_perf_save_disable();
+
+       /*
+        * First go through the list and put on any pinned groups
+        * in order to give them the best chance of going on.
+        */
        list_for_each_entry(counter, &ctx->counter_list, list_entry) {
-               if (ctx->nr_active == cpuctx->max_pertask)
-                       break;
+               if (counter->state <= PERF_COUNTER_STATE_OFF ||
+                   !counter->hw_event.pinned)
+                       continue;
+               if (counter->cpu != -1 && counter->cpu != cpu)
+                       continue;
+
+               if (group_can_go_on(counter, cpuctx, 1))
+                       group_sched_in(counter, cpuctx, ctx, cpu);
+
+               /*
+                * If this pinned group hasn't been scheduled,
+                * put it in error state.
+                */
+               if (counter->state == PERF_COUNTER_STATE_INACTIVE)
+                       counter->state = PERF_COUNTER_STATE_ERROR;
+       }
+
+       list_for_each_entry(counter, &ctx->counter_list, list_entry) {
+               /*
+                * Ignore counters in OFF or ERROR state, and
+                * ignore pinned counters since we did them already.
+                */
+               if (counter->state <= PERF_COUNTER_STATE_OFF ||
+                   counter->hw_event.pinned)
+                       continue;
 
                /*
                 * Listen to the 'cpu' scheduling filter constraint
@@ -421,32 +820,61 @@ void perf_counter_task_sched_in(struct task_struct *task, int cpu)
                if (counter->cpu != -1 && counter->cpu != cpu)
                        continue;
 
-               /*
-                * If we scheduled in a group atomically and
-                * exclusively, break out:
-                */
-               if (group_sched_in(counter, cpuctx, ctx, cpu))
-                       break;
+               if (group_can_go_on(counter, cpuctx, can_add_hw)) {
+                       if (group_sched_in(counter, cpuctx, ctx, cpu))
+                               can_add_hw = 0;
+               }
        }
+       hw_perf_restore(flags);
+ out:
        spin_unlock(&ctx->lock);
+}
+
+/*
+ * Called from scheduler to add the counters of the current task
+ * with interrupts disabled.
+ *
+ * We restore the counter value and then enable it.
+ *
+ * This does not protect us against NMI, but enable()
+ * sets the enabled bit in the control field of counter _before_
+ * accessing the counter control register. If a NMI hits, then it will
+ * keep the counter running.
+ */
+void perf_counter_task_sched_in(struct task_struct *task, int cpu)
+{
+       struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+       struct perf_counter_context *ctx = &task->perf_counter_ctx;
 
+       __perf_counter_sched_in(ctx, cpuctx, cpu);
        cpuctx->task_ctx = ctx;
 }
 
+static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
+{
+       struct perf_counter_context *ctx = &cpuctx->ctx;
+
+       __perf_counter_sched_in(ctx, cpuctx, cpu);
+}
+
 int perf_counter_task_disable(void)
 {
        struct task_struct *curr = current;
        struct perf_counter_context *ctx = &curr->perf_counter_ctx;
        struct perf_counter *counter;
+       unsigned long flags;
        u64 perf_flags;
        int cpu;
 
        if (likely(!ctx->nr_counters))
                return 0;
 
-       local_irq_disable();
+       curr_rq_lock_irq_save(&flags);
        cpu = smp_processor_id();
 
+       /* force the update of the task clock: */
+       __task_delta_exec(curr, 1);
+
        perf_counter_task_sched_out(curr, cpu);
 
        spin_lock(&ctx->lock);
@@ -456,14 +884,16 @@ int perf_counter_task_disable(void)
         */
        perf_flags = hw_perf_save_disable();
 
-       list_for_each_entry(counter, &ctx->counter_list, list_entry)
-               counter->state = PERF_COUNTER_STATE_OFF;
+       list_for_each_entry(counter, &ctx->counter_list, list_entry) {
+               if (counter->state != PERF_COUNTER_STATE_ERROR)
+                       counter->state = PERF_COUNTER_STATE_OFF;
+       }
 
        hw_perf_restore(perf_flags);
 
        spin_unlock(&ctx->lock);
 
-       local_irq_enable();
+       curr_rq_unlock_irq_restore(&flags);
 
        return 0;
 }
@@ -473,15 +903,21 @@ int perf_counter_task_enable(void)
        struct task_struct *curr = current;
        struct perf_counter_context *ctx = &curr->perf_counter_ctx;
        struct perf_counter *counter;
+       unsigned long flags;
        u64 perf_flags;
        int cpu;
 
        if (likely(!ctx->nr_counters))
                return 0;
 
-       local_irq_disable();
+       curr_rq_lock_irq_save(&flags);
        cpu = smp_processor_id();
 
+       /* force the update of the task clock: */
+       __task_delta_exec(curr, 1);
+
+       perf_counter_task_sched_out(curr, cpu);
+
        spin_lock(&ctx->lock);
 
        /*
@@ -490,9 +926,10 @@ int perf_counter_task_enable(void)
        perf_flags = hw_perf_save_disable();
 
        list_for_each_entry(counter, &ctx->counter_list, list_entry) {
-               if (counter->state != PERF_COUNTER_STATE_OFF)
+               if (counter->state > PERF_COUNTER_STATE_OFF)
                        continue;
                counter->state = PERF_COUNTER_STATE_INACTIVE;
+               counter->hw_event.disabled = 0;
        }
        hw_perf_restore(perf_flags);
 
@@ -500,24 +937,23 @@ int perf_counter_task_enable(void)
 
        perf_counter_task_sched_in(curr, cpu);
 
-       local_irq_enable();
+       curr_rq_unlock_irq_restore(&flags);
 
        return 0;
 }
 
-void perf_counter_task_tick(struct task_struct *curr, int cpu)
+/*
+ * Round-robin a context's counters:
+ */
+static void rotate_ctx(struct perf_counter_context *ctx)
 {
-       struct perf_counter_context *ctx = &curr->perf_counter_ctx;
        struct perf_counter *counter;
        u64 perf_flags;
 
-       if (likely(!ctx->nr_counters))
+       if (!ctx->nr_counters)
                return;
 
-       perf_counter_task_sched_out(curr, cpu);
-
        spin_lock(&ctx->lock);
-
        /*
         * Rotate the first entry last (works just fine for group counters too):
         */
@@ -530,7 +966,24 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu)
        hw_perf_restore(perf_flags);
 
        spin_unlock(&ctx->lock);
+}
+
+void perf_counter_task_tick(struct task_struct *curr, int cpu)
+{
+       struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+       struct perf_counter_context *ctx = &curr->perf_counter_ctx;
+       const int rotate_percpu = 0;
+
+       if (rotate_percpu)
+               perf_counter_cpu_sched_out(cpuctx);
+       perf_counter_task_sched_out(curr, cpu);
+
+       if (rotate_percpu)
+               rotate_ctx(&cpuctx->ctx);
+       rotate_ctx(ctx);
 
+       if (rotate_percpu)
+               perf_counter_cpu_sched_in(cpuctx, cpu);
        perf_counter_task_sched_in(curr, cpu);
 }
 
@@ -540,8 +993,11 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu)
 static void __read(void *info)
 {
        struct perf_counter *counter = info;
+       unsigned long flags;
 
+       curr_rq_lock_irq_save(&flags);
        counter->hw_ops->read(counter);
+       curr_rq_unlock_irq_restore(&flags);
 }
 
 static u64 perf_counter_read(struct perf_counter *counter)
@@ -689,14 +1145,16 @@ static int perf_release(struct inode *inode, struct file *file)
 
        file->private_data = NULL;
 
+       mutex_lock(&ctx->mutex);
        mutex_lock(&counter->mutex);
 
        perf_counter_remove_from_context(counter);
-       put_context(ctx);
 
        mutex_unlock(&counter->mutex);
+       mutex_unlock(&ctx->mutex);
 
        kfree(counter);
+       put_context(ctx);
 
        return 0;
 }
@@ -712,6 +1170,14 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
        if (count != sizeof(cntval))
                return -EINVAL;
 
+       /*
+        * Return end-of-file for a read on a counter that is in
+        * error state (i.e. because it was pinned but it couldn't be
+        * scheduled on to the CPU at some point).
+        */
+       if (counter->state == PERF_COUNTER_STATE_ERROR)
+               return 0;
+
        mutex_lock(&counter->mutex);
        cntval = perf_counter_read(counter);
        mutex_unlock(&counter->mutex);
@@ -747,7 +1213,7 @@ perf_read_irq_data(struct perf_counter     *counter,
 {
        struct perf_data *irqdata, *usrdata;
        DECLARE_WAITQUEUE(wait, current);
-       ssize_t res;
+       ssize_t res, res2;
 
        irqdata = counter->irqdata;
        usrdata = counter->usrdata;
@@ -768,6 +1234,9 @@ perf_read_irq_data(struct perf_counter     *counter,
                if (signal_pending(current))
                        break;
 
+               if (counter->state == PERF_COUNTER_STATE_ERROR)
+                       break;
+
                spin_unlock_irq(&counter->waitq.lock);
                schedule();
                spin_lock_irq(&counter->waitq.lock);
@@ -776,7 +1245,8 @@ perf_read_irq_data(struct perf_counter     *counter,
        __set_current_state(TASK_RUNNING);
        spin_unlock_irq(&counter->waitq.lock);
 
-       if (usrdata->len + irqdata->len < count)
+       if (usrdata->len + irqdata->len < count &&
+           counter->state != PERF_COUNTER_STATE_ERROR)
                return -ERESTARTSYS;
 read_pending:
        mutex_lock(&counter->mutex);
@@ -788,11 +1258,12 @@ read_pending:
 
        /* Switch irq buffer: */
        usrdata = perf_switch_irq_data(counter);
-       if (perf_copy_usrdata(usrdata, buf + res, count - res) < 0) {
+       res2 = perf_copy_usrdata(usrdata, buf + res, count - res);
+       if (res2 < 0) {
                if (!res)
                        res = -EFAULT;
        } else {
-               res = count;
+               res += res2;
        }
 out:
        mutex_unlock(&counter->mutex);
@@ -833,25 +1304,60 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
        return events;
 }
 
+static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+       struct perf_counter *counter = file->private_data;
+       int err = 0;
+
+       switch (cmd) {
+       case PERF_COUNTER_IOC_ENABLE:
+               perf_counter_enable_family(counter);
+               break;
+       case PERF_COUNTER_IOC_DISABLE:
+               perf_counter_disable_family(counter);
+               break;
+       default:
+               err = -ENOTTY;
+       }
+       return err;
+}
+
 static const struct file_operations perf_fops = {
        .release                = perf_release,
        .read                   = perf_read,
        .poll                   = perf_poll,
+       .unlocked_ioctl         = perf_ioctl,
+       .compat_ioctl           = perf_ioctl,
 };
 
-static void cpu_clock_perf_counter_enable(struct perf_counter *counter)
+static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
+{
+       int cpu = raw_smp_processor_id();
+
+       atomic64_set(&counter->hw.prev_count, cpu_clock(cpu));
+       return 0;
+}
+
+static void cpu_clock_perf_counter_update(struct perf_counter *counter)
 {
+       int cpu = raw_smp_processor_id();
+       s64 prev;
+       u64 now;
+
+       now = cpu_clock(cpu);
+       prev = atomic64_read(&counter->hw.prev_count);
+       atomic64_set(&counter->hw.prev_count, now);
+       atomic64_add(now - prev, &counter->count);
 }
 
 static void cpu_clock_perf_counter_disable(struct perf_counter *counter)
 {
+       cpu_clock_perf_counter_update(counter);
 }
 
 static void cpu_clock_perf_counter_read(struct perf_counter *counter)
 {
-       int cpu = raw_smp_processor_id();
-
-       atomic64_set(&counter->count, cpu_clock(cpu));
+       cpu_clock_perf_counter_update(counter);
 }
 
 static const struct hw_perf_counter_ops perf_ops_cpu_clock = {
@@ -860,13 +1366,25 @@ static const struct hw_perf_counter_ops perf_ops_cpu_clock = {
        .read           = cpu_clock_perf_counter_read,
 };
 
-static void task_clock_perf_counter_update(struct perf_counter *counter)
+/*
+ * Called from within the scheduler:
+ */
+static u64 task_clock_perf_counter_val(struct perf_counter *counter, int update)
 {
-       u64 prev, now;
+       struct task_struct *curr = counter->task;
+       u64 delta;
+
+       delta = __task_delta_exec(curr, update);
+
+       return curr->se.sum_exec_runtime + delta;
+}
+
+static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now)
+{
+       u64 prev;
        s64 delta;
 
        prev = atomic64_read(&counter->hw.prev_count);
-       now = current->se.sum_exec_runtime;
 
        atomic64_set(&counter->hw.prev_count, now);
 
@@ -877,17 +1395,25 @@ static void task_clock_perf_counter_update(struct perf_counter *counter)
 
 static void task_clock_perf_counter_read(struct perf_counter *counter)
 {
-       task_clock_perf_counter_update(counter);
+       u64 now = task_clock_perf_counter_val(counter, 1);
+
+       task_clock_perf_counter_update(counter, now);
 }
 
-static void task_clock_perf_counter_enable(struct perf_counter *counter)
+static int task_clock_perf_counter_enable(struct perf_counter *counter)
 {
-       atomic64_set(&counter->hw.prev_count, current->se.sum_exec_runtime);
+       if (counter->prev_state <= PERF_COUNTER_STATE_OFF)
+               atomic64_set(&counter->hw.prev_count,
+                            task_clock_perf_counter_val(counter, 0));
+
+       return 0;
 }
 
 static void task_clock_perf_counter_disable(struct perf_counter *counter)
 {
-       task_clock_perf_counter_update(counter);
+       u64 now = task_clock_perf_counter_val(counter, 0);
+
+       task_clock_perf_counter_update(counter, now);
 }
 
 static const struct hw_perf_counter_ops perf_ops_task_clock = {
@@ -896,11 +1422,19 @@ static const struct hw_perf_counter_ops perf_ops_task_clock = {
        .read           = task_clock_perf_counter_read,
 };
 
-static u64 get_page_faults(void)
+#ifdef CONFIG_VM_EVENT_COUNTERS
+#define cpu_page_faults()      __get_cpu_var(vm_event_states).event[PGFAULT]
+#else
+#define cpu_page_faults()      0
+#endif
+
+static u64 get_page_faults(struct perf_counter *counter)
 {
-       struct task_struct *curr = current;
+       struct task_struct *curr = counter->ctx->task;
 
-       return curr->maj_flt + curr->min_flt;
+       if (curr)
+               return curr->maj_flt + curr->min_flt;
+       return cpu_page_faults();
 }
 
 static void page_faults_perf_counter_update(struct perf_counter *counter)
@@ -909,7 +1443,7 @@ static void page_faults_perf_counter_update(struct perf_counter *counter)
        s64 delta;
 
        prev = atomic64_read(&counter->hw.prev_count);
-       now = get_page_faults();
+       now = get_page_faults(counter);
 
        atomic64_set(&counter->hw.prev_count, now);
 
@@ -923,12 +1457,11 @@ static void page_faults_perf_counter_read(struct perf_counter *counter)
        page_faults_perf_counter_update(counter);
 }
 
-static void page_faults_perf_counter_enable(struct perf_counter *counter)
+static int page_faults_perf_counter_enable(struct perf_counter *counter)
 {
-       /*
-        * page-faults is a per-task value already,
-        * so we dont have to clear it on switch-in.
-        */
+       if (counter->prev_state <= PERF_COUNTER_STATE_OFF)
+               atomic64_set(&counter->hw.prev_count, get_page_faults(counter));
+       return 0;
 }
 
 static void page_faults_perf_counter_disable(struct perf_counter *counter)
@@ -942,11 +1475,13 @@ static const struct hw_perf_counter_ops perf_ops_page_faults = {
        .read           = page_faults_perf_counter_read,
 };
 
-static u64 get_context_switches(void)
+static u64 get_context_switches(struct perf_counter *counter)
 {
-       struct task_struct *curr = current;
+       struct task_struct *curr = counter->ctx->task;
 
-       return curr->nvcsw + curr->nivcsw;
+       if (curr)
+               return curr->nvcsw + curr->nivcsw;
+       return cpu_nr_switches(smp_processor_id());
 }
 
 static void context_switches_perf_counter_update(struct perf_counter *counter)
@@ -955,7 +1490,7 @@ static void context_switches_perf_counter_update(struct perf_counter *counter)
        s64 delta;
 
        prev = atomic64_read(&counter->hw.prev_count);
-       now = get_context_switches();
+       now = get_context_switches(counter);
 
        atomic64_set(&counter->hw.prev_count, now);
 
@@ -969,12 +1504,12 @@ static void context_switches_perf_counter_read(struct perf_counter *counter)
        context_switches_perf_counter_update(counter);
 }
 
-static void context_switches_perf_counter_enable(struct perf_counter *counter)
+static int context_switches_perf_counter_enable(struct perf_counter *counter)
 {
-       /*
-        * ->nvcsw + curr->nivcsw is a per-task value already,
-        * so we dont have to clear it on switch-in.
-        */
+       if (counter->prev_state <= PERF_COUNTER_STATE_OFF)
+               atomic64_set(&counter->hw.prev_count,
+                            get_context_switches(counter));
+       return 0;
 }
 
 static void context_switches_perf_counter_disable(struct perf_counter *counter)
@@ -988,9 +1523,13 @@ static const struct hw_perf_counter_ops perf_ops_context_switches = {
        .read           = context_switches_perf_counter_read,
 };
 
-static inline u64 get_cpu_migrations(void)
+static inline u64 get_cpu_migrations(struct perf_counter *counter)
 {
-       return current->se.nr_migrations;
+       struct task_struct *curr = counter->ctx->task;
+
+       if (curr)
+               return curr->se.nr_migrations;
+       return cpu_nr_migrations(smp_processor_id());
 }
 
 static void cpu_migrations_perf_counter_update(struct perf_counter *counter)
@@ -999,7 +1538,7 @@ static void cpu_migrations_perf_counter_update(struct perf_counter *counter)
        s64 delta;
 
        prev = atomic64_read(&counter->hw.prev_count);
-       now = get_cpu_migrations();
+       now = get_cpu_migrations(counter);
 
        atomic64_set(&counter->hw.prev_count, now);
 
@@ -1013,12 +1552,12 @@ static void cpu_migrations_perf_counter_read(struct perf_counter *counter)
        cpu_migrations_perf_counter_update(counter);
 }
 
-static void cpu_migrations_perf_counter_enable(struct perf_counter *counter)
+static int cpu_migrations_perf_counter_enable(struct perf_counter *counter)
 {
-       /*
-        * se.nr_migrations is a per-task value already,
-        * so we dont have to clear it on switch-in.
-        */
+       if (counter->prev_state <= PERF_COUNTER_STATE_OFF)
+               atomic64_set(&counter->hw.prev_count,
+                            get_cpu_migrations(counter));
+       return 0;
 }
 
 static void cpu_migrations_perf_counter_disable(struct perf_counter *counter)
@@ -1037,21 +1576,46 @@ sw_perf_counter_init(struct perf_counter *counter)
 {
        const struct hw_perf_counter_ops *hw_ops = NULL;
 
+       /*
+        * Software counters (currently) can't in general distinguish
+        * between user, kernel and hypervisor events.
+        * However, context switches and cpu migrations are considered
+        * to be kernel events, and page faults are never hypervisor
+        * events.
+        */
        switch (counter->hw_event.type) {
        case PERF_COUNT_CPU_CLOCK:
-               hw_ops = &perf_ops_cpu_clock;
+               if (!(counter->hw_event.exclude_user ||
+                     counter->hw_event.exclude_kernel ||
+                     counter->hw_event.exclude_hv))
+                       hw_ops = &perf_ops_cpu_clock;
                break;
        case PERF_COUNT_TASK_CLOCK:
-               hw_ops = &perf_ops_task_clock;
+               if (counter->hw_event.exclude_user ||
+                   counter->hw_event.exclude_kernel ||
+                   counter->hw_event.exclude_hv)
+                       break;
+               /*
+                * If the user instantiates this as a per-cpu counter,
+                * use the cpu_clock counter instead.
+                */
+               if (counter->ctx->task)
+                       hw_ops = &perf_ops_task_clock;
+               else
+                       hw_ops = &perf_ops_cpu_clock;
                break;
        case PERF_COUNT_PAGE_FAULTS:
-               hw_ops = &perf_ops_page_faults;
+               if (!(counter->hw_event.exclude_user ||
+                     counter->hw_event.exclude_kernel))
+                       hw_ops = &perf_ops_page_faults;
                break;
        case PERF_COUNT_CONTEXT_SWITCHES:
-               hw_ops = &perf_ops_context_switches;
+               if (!counter->hw_event.exclude_kernel)
+                       hw_ops = &perf_ops_context_switches;
                break;
        case PERF_COUNT_CPU_MIGRATIONS:
-               hw_ops = &perf_ops_cpu_migrations;
+               if (!counter->hw_event.exclude_kernel)
+                       hw_ops = &perf_ops_cpu_migrations;
                break;
        default:
                break;
@@ -1065,6 +1629,7 @@ sw_perf_counter_init(struct perf_counter *counter)
 static struct perf_counter *
 perf_counter_alloc(struct perf_counter_hw_event *hw_event,
                   int cpu,
+                  struct perf_counter_context *ctx,
                   struct perf_counter *group_leader,
                   gfp_t gfpflags)
 {
@@ -1087,6 +1652,8 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
        INIT_LIST_HEAD(&counter->sibling_list);
        init_waitqueue_head(&counter->waitq);
 
+       INIT_LIST_HEAD(&counter->child_list);
+
        counter->irqdata                = &counter->data[0];
        counter->usrdata                = &counter->data[1];
        counter->cpu                    = cpu;
@@ -1094,14 +1661,16 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
        counter->wakeup_pending         = 0;
        counter->group_leader           = group_leader;
        counter->hw_ops                 = NULL;
+       counter->ctx                    = ctx;
 
+       counter->state = PERF_COUNTER_STATE_INACTIVE;
        if (hw_event->disabled)
                counter->state = PERF_COUNTER_STATE_OFF;
 
        hw_ops = NULL;
        if (!hw_event->raw && hw_event->type < 0)
                hw_ops = sw_perf_counter_init(counter);
-       if (!hw_ops)
+       else
                hw_ops = hw_perf_counter_init(counter);
 
        if (!hw_ops) {
@@ -1121,9 +1690,9 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
  * @cpu:               target cpu
  * @group_fd:          group leader counter fd
  */
-asmlinkage int
-sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user,
-                     pid_t pid, int cpu, int group_fd)
+SYSCALL_DEFINE4(perf_counter_open,
+               const struct perf_counter_hw_event __user *, hw_event_uptr,
+               pid_t, pid, int, cpu, int, group_fd)
 {
        struct perf_counter *counter, *group_leader;
        struct perf_counter_hw_event hw_event;
@@ -1169,10 +1738,16 @@ sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user,
                 */
                if (group_leader->ctx != ctx)
                        goto err_put_context;
+               /*
+                * Only a group leader can be exclusive or pinned
+                */
+               if (hw_event.exclusive || hw_event.pinned)
+                       goto err_put_context;
        }
 
        ret = -EINVAL;
-       counter = perf_counter_alloc(&hw_event, cpu, group_leader, GFP_KERNEL);
+       counter = perf_counter_alloc(&hw_event, cpu, ctx, group_leader,
+                                    GFP_KERNEL);
        if (!counter)
                goto err_put_context;
 
@@ -1185,7 +1760,9 @@ sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user,
                goto err_free_put_context;
 
        counter->filp = counter_file;
+       mutex_lock(&ctx->mutex);
        perf_install_in_context(ctx, counter, cpu);
+       mutex_unlock(&ctx->mutex);
 
        fput_light(counter_file, fput_needed2);
 
@@ -1212,6 +1789,7 @@ __perf_counter_init_context(struct perf_counter_context *ctx,
 {
        memset(ctx, 0, sizeof(*ctx));
        spin_lock_init(&ctx->lock);
+       mutex_init(&ctx->mutex);
        INIT_LIST_HEAD(&ctx->counter_list);
        ctx->task = task;
 }
@@ -1219,31 +1797,39 @@ __perf_counter_init_context(struct perf_counter_context *ctx,
 /*
  * inherit a counter from parent task to child task:
  */
-static int
+static struct perf_counter *
 inherit_counter(struct perf_counter *parent_counter,
              struct task_struct *parent,
              struct perf_counter_context *parent_ctx,
              struct task_struct *child,
+             struct perf_counter *group_leader,
              struct perf_counter_context *child_ctx)
 {
        struct perf_counter *child_counter;
 
+       /*
+        * Instead of creating recursive hierarchies of counters,
+        * we link inherited counters back to the original parent,
+        * which has a filp for sure, which we use as the reference
+        * count:
+        */
+       if (parent_counter->parent)
+               parent_counter = parent_counter->parent;
+
        child_counter = perf_counter_alloc(&parent_counter->hw_event,
-                                           parent_counter->cpu, NULL,
-                                           GFP_ATOMIC);
+                                          parent_counter->cpu, child_ctx,
+                                          group_leader, GFP_KERNEL);
        if (!child_counter)
-               return -ENOMEM;
+               return NULL;
 
        /*
         * Link it up in the child's context:
         */
-       child_counter->ctx = child_ctx;
        child_counter->task = child;
        list_add_counter(child_counter, child_ctx);
        child_ctx->nr_counters++;
 
        child_counter->parent = parent_counter;
-       parent_counter->nr_inherited++;
        /*
         * inherit into child's child as well:
         */
@@ -1257,44 +1843,119 @@ inherit_counter(struct perf_counter *parent_counter,
         */
        atomic_long_inc(&parent_counter->filp->f_count);
 
+       /*
+        * Link this into the parent counter's child list
+        */
+       mutex_lock(&parent_counter->mutex);
+       list_add_tail(&child_counter->child_list, &parent_counter->child_list);
+
+       /*
+        * Make the child state follow the state of the parent counter,
+        * not its hw_event.disabled bit.  We hold the parent's mutex,
+        * so we won't race with perf_counter_{en,dis}able_family.
+        */
+       if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE)
+               child_counter->state = PERF_COUNTER_STATE_INACTIVE;
+       else
+               child_counter->state = PERF_COUNTER_STATE_OFF;
+
+       mutex_unlock(&parent_counter->mutex);
+
+       return child_counter;
+}
+
+static int inherit_group(struct perf_counter *parent_counter,
+             struct task_struct *parent,
+             struct perf_counter_context *parent_ctx,
+             struct task_struct *child,
+             struct perf_counter_context *child_ctx)
+{
+       struct perf_counter *leader;
+       struct perf_counter *sub;
+
+       leader = inherit_counter(parent_counter, parent, parent_ctx,
+                                child, NULL, child_ctx);
+       if (!leader)
+               return -ENOMEM;
+       list_for_each_entry(sub, &parent_counter->sibling_list, list_entry) {
+               if (!inherit_counter(sub, parent, parent_ctx,
+                                    child, leader, child_ctx))
+                       return -ENOMEM;
+       }
        return 0;
 }
 
+static void sync_child_counter(struct perf_counter *child_counter,
+                              struct perf_counter *parent_counter)
+{
+       u64 parent_val, child_val;
+
+       parent_val = atomic64_read(&parent_counter->count);
+       child_val = atomic64_read(&child_counter->count);
+
+       /*
+        * Add back the child's count to the parent's count:
+        */
+       atomic64_add(child_val, &parent_counter->count);
+
+       /*
+        * Remove this counter from the parent's list
+        */
+       mutex_lock(&parent_counter->mutex);
+       list_del_init(&child_counter->child_list);
+       mutex_unlock(&parent_counter->mutex);
+
+       /*
+        * Release the parent counter, if this was the last
+        * reference to it.
+        */
+       fput(parent_counter->filp);
+}
+
 static void
 __perf_counter_exit_task(struct task_struct *child,
                         struct perf_counter *child_counter,
                         struct perf_counter_context *child_ctx)
 {
        struct perf_counter *parent_counter;
-       u64 parent_val, child_val;
-       u64 perf_flags;
+       struct perf_counter *sub, *tmp;
 
        /*
-        * Disable and unlink this counter.
-        *
-        * Be careful about zapping the list - IRQ/NMI context
-        * could still be processing it:
+        * If we do not self-reap then we have to wait for the
+        * child task to unschedule (it will happen for sure),
+        * so that its counter is at its final count. (This
+        * condition triggers rarely - child tasks usually get
+        * off their CPU before the parent has a chance to
+        * get this far into the reaping action)
         */
-       local_irq_disable();
-       perf_flags = hw_perf_save_disable();
-
-       if (child_counter->state == PERF_COUNTER_STATE_ACTIVE) {
+       if (child != current) {
+               wait_task_inactive(child, 0);
+               list_del_init(&child_counter->list_entry);
+       } else {
                struct perf_cpu_context *cpuctx;
+               unsigned long flags;
+               u64 perf_flags;
+
+               /*
+                * Disable and unlink this counter.
+                *
+                * Be careful about zapping the list - IRQ/NMI context
+                * could still be processing it:
+                */
+               curr_rq_lock_irq_save(&flags);
+               perf_flags = hw_perf_save_disable();
 
                cpuctx = &__get_cpu_var(perf_cpu_context);
 
-               child_counter->hw_ops->disable(child_counter);
-               child_counter->state = PERF_COUNTER_STATE_INACTIVE;
-               child_counter->oncpu = -1;
+               group_sched_out(child_counter, cpuctx, child_ctx);
 
-               cpuctx->active_oncpu--;
-               child_ctx->nr_active--;
-       }
+               list_del_init(&child_counter->list_entry);
 
-       list_del_init(&child_counter->list_entry);
+               child_ctx->nr_counters--;
 
-       hw_perf_restore(perf_flags);
-       local_irq_enable();
+               hw_perf_restore(perf_flags);
+               curr_rq_unlock_irq_restore(&flags);
+       }
 
        parent_counter = child_counter->parent;
        /*
@@ -1302,26 +1963,23 @@ __perf_counter_exit_task(struct task_struct *child,
         * that are still around due to the child reference. These
         * counters need to be zapped - but otherwise linger.
         */
-       if (!parent_counter)
-               return;
-
-       parent_val = atomic64_read(&parent_counter->count);
-       child_val = atomic64_read(&child_counter->count);
-
-       /*
-        * Add back the child's count to the parent's count:
-        */
-       atomic64_add(child_val, &parent_counter->count);
-
-       fput(parent_counter->filp);
-
-       kfree(child_counter);
+       if (parent_counter) {
+               sync_child_counter(child_counter, parent_counter);
+               list_for_each_entry_safe(sub, tmp, &child_counter->sibling_list,
+                                        list_entry) {
+                       if (sub->parent) {
+                               sync_child_counter(sub, sub->parent);
+                               kfree(sub);
+                       }
+               }
+               kfree(child_counter);
+       }
 }
 
 /*
- * When a child task exist, feed back counter values to parent counters.
+ * When a child task exits, feed back counter values to parent counters.
  *
- * Note: we are running in child context, but the PID is not hashed
+ * Note: we may be running in child context, but the PID is not hashed
  * anymore so new counters will not be added.
  */
 void perf_counter_exit_task(struct task_struct *child)
@@ -1345,9 +2003,8 @@ void perf_counter_exit_task(struct task_struct *child)
 void perf_counter_init_task(struct task_struct *child)
 {
        struct perf_counter_context *child_ctx, *parent_ctx;
-       struct perf_counter *counter, *parent_counter;
+       struct perf_counter *counter;
        struct task_struct *parent = current;
-       unsigned long flags;
 
        child_ctx  =  &child->perf_counter_ctx;
        parent_ctx = &parent->perf_counter_ctx;
@@ -1366,32 +2023,22 @@ void perf_counter_init_task(struct task_struct *child)
         * Lock the parent list. No need to lock the child - not PID
         * hashed yet and not running, so nobody can access it.
         */
-       spin_lock_irqsave(&parent_ctx->lock, flags);
+       mutex_lock(&parent_ctx->mutex);
 
        /*
         * We dont have to disable NMIs - we are only looking at
         * the list, not manipulating it:
         */
        list_for_each_entry(counter, &parent_ctx->counter_list, list_entry) {
-               if (!counter->hw_event.inherit || counter->group_leader != counter)
+               if (!counter->hw_event.inherit)
                        continue;
 
-               /*
-                * Instead of creating recursive hierarchies of counters,
-                * we link inheritd counters back to the original parent,
-                * which has a filp for sure, which we use as the reference
-                * count:
-                */
-               parent_counter = counter;
-               if (counter->parent)
-                       parent_counter = counter->parent;
-
-               if (inherit_counter(parent_counter, parent,
+               if (inherit_group(counter, parent,
                                  parent_ctx, child, child_ctx))
                        break;
        }
 
-       spin_unlock_irqrestore(&parent_ctx->lock, flags);
+       mutex_unlock(&parent_ctx->mutex);
 }
 
 static void __cpuinit perf_counter_init_cpu(int cpu)
@@ -1405,7 +2052,7 @@ static void __cpuinit perf_counter_init_cpu(int cpu)
        cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu;
        mutex_unlock(&perf_resource_mutex);
 
-       hw_perf_counter_setup();
+       hw_perf_counter_setup(cpu);
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
@@ -1417,11 +2064,15 @@ static void __perf_counter_exit_cpu(void *info)
 
        list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry)
                __perf_counter_remove_from_context(counter);
-
 }
 static void perf_counter_exit_cpu(int cpu)
 {
+       struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+       struct perf_counter_context *ctx = &cpuctx->ctx;
+
+       mutex_lock(&ctx->mutex);
        smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1);
+       mutex_unlock(&ctx->mutex);
 }
 #else
 static inline void perf_counter_exit_cpu(int cpu) { }