Merge branch 'for-3.4' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 1b5c081d8b9f9c8ea05f1a8ecaf861f80ab7ba1c..4b50357914fb437a30cd146e1bd33e1f2b43c449 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -118,6 +118,13 @@ static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
                       PERF_FLAG_FD_OUTPUT  |\
                       PERF_FLAG_PID_CGROUP)
 
+/*
+ * branch priv levels that need permission checks
+ */
+#define PERF_SAMPLE_BRANCH_PERM_PLM \
+       (PERF_SAMPLE_BRANCH_KERNEL |\
+        PERF_SAMPLE_BRANCH_HV)
+
 enum event_type_t {
        EVENT_FLEXIBLE = 0x1,
        EVENT_PINNED = 0x2,
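
Note on the new mask: PERF_SAMPLE_BRANCH_PERM_PLM intentionally covers only the kernel and hypervisor levels, since user-level capture needs no privilege check; perf_copy_attr() below tests it together with perf_paranoid_kernel(). For orientation, the companion perf_event.h patch defines the full priv-level mask roughly as follows (quoted as a sketch for context, not part of this hunk):

	#define PERF_SAMPLE_BRANCH_PLM_ALL \
		(PERF_SAMPLE_BRANCH_USER |\
		 PERF_SAMPLE_BRANCH_KERNEL |\
		 PERF_SAMPLE_BRANCH_HV)
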
@@ -128,8 +135,9 @@ enum event_type_t {
  * perf_sched_events : >0 events exist
  * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
  */
-struct jump_label_key_deferred perf_sched_events __read_mostly;
+struct static_key_deferred perf_sched_events __read_mostly;
 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
+static DEFINE_PER_CPU(atomic_t, perf_branch_stack_events);
 
 static atomic_t nr_mmap_events __read_mostly;
 static atomic_t nr_comm_events __read_mostly;
@@ -881,6 +889,9 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
        if (is_cgroup_event(event))
                ctx->nr_cgroups++;
 
+       if (has_branch_stack(event))
+               ctx->nr_branch_stack++;
+
        list_add_rcu(&event->event_entry, &ctx->event_list);
        if (!ctx->nr_events)
                perf_pmu_rotate_start(ctx->pmu);
@@ -1020,6 +1031,9 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
                        cpuctx->cgrp = NULL;
        }
 
+       if (has_branch_stack(event))
+               ctx->nr_branch_stack--;
+
        ctx->nr_events--;
        if (event->attr.inherit_stat)
                ctx->nr_stat--;
@@ -2194,6 +2208,66 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
        perf_pmu_rotate_start(ctx->pmu);
 }
 
+/*
+ * When sampling the branch stack in system-wide mode, it may be necessary
+ * to flush the stack on context switch. This happens when the branch
+ * stack does not tag its entries with the pid of the current task.
+ * Otherwise it becomes impossible to associate a branch entry with a
+ * task. This ambiguity is more likely to appear when the branch stack
+ * supports priv level filtering and the user sets it to monitor only
+ * at the user level (which could be a useful measurement in system-wide
+ * mode). In that case, the risk is high of having a branch stack with
+ * branches from multiple tasks. Flushing may mean dropping the existing
+ * entries or stashing them somewhere in the PMU specific code layer.
+ *
+ * This function provides the context switch callback to the lower code
+ * layer. It is invoked ONLY when there is at least one system-wide context
+ * with at least one active event using taken branch sampling.
+ */
+static void perf_branch_stack_sched_in(struct task_struct *prev,
+                                      struct task_struct *task)
+{
+       struct perf_cpu_context *cpuctx;
+       struct pmu *pmu;
+       unsigned long flags;
+
+       /* no need to flush branch stack if not changing task */
+       if (prev == task)
+               return;
+
+       local_irq_save(flags);
+
+       rcu_read_lock();
+
+       list_for_each_entry_rcu(pmu, &pmus, entry) {
+               cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+
+               /*
+                * check if the context has at least one
+                * event using PERF_SAMPLE_BRANCH_STACK
+                */
+               if (cpuctx->ctx.nr_branch_stack > 0
+                   && pmu->flush_branch_stack) {
+
+                       pmu = cpuctx->ctx.pmu;
+
+                       perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+
+                       perf_pmu_disable(pmu);
+
+                       pmu->flush_branch_stack();
+
+                       perf_pmu_enable(pmu);
+
+                       perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
+               }
+       }
+
+       rcu_read_unlock();
+
+       local_irq_restore(flags);
+}
+
 /*
  * Called from scheduler to add the events of the current task
  * with interrupts disabled.
@@ -2225,6 +2299,10 @@ void __perf_event_task_sched_in(struct task_struct *prev,
         */
        if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
                perf_cgroup_sched_in(prev, task);
+
+       /* check for system-wide branch_stack events */
+       if (atomic_read(&__get_cpu_var(perf_branch_stack_events)))
+               perf_branch_stack_sched_in(prev, task);
 }
 
 static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
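
The two hunks above only decide when to call pmu->flush_branch_stack(); what "flushing" means is left to the PMU driver, which may drop the hardware entries or stash them in its own per-CPU state. A minimal sketch of a driver wiring up the callback (the my_pmu_* names are hypothetical, not existing kernel symbols):

	#include <linux/perf_event.h>

	/* invoked on context switch when a system-wide branch-stack event is active */
	static void my_pmu_flush_branch_stack(void)
	{
		/* hypothetical helper: reset or save the LBR-style branch buffer */
		my_pmu_reset_branch_buffer();
	}

	static struct pmu my_pmu = {
		.task_ctx_nr		= perf_hw_context,
		/* ... the usual event_init/add/del/start/stop/read callbacks ... */
		.flush_branch_stack	= my_pmu_flush_branch_stack,
	};
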
@@ -2778,7 +2856,7 @@ static void free_event(struct perf_event *event)
 
        if (!event->parent) {
                if (event->attach_state & PERF_ATTACH_TASK)
-                       jump_label_dec_deferred(&perf_sched_events);
+                       static_key_slow_dec_deferred(&perf_sched_events);
                if (event->attr.mmap || event->attr.mmap_data)
                        atomic_dec(&nr_mmap_events);
                if (event->attr.comm)
@@ -2789,7 +2867,15 @@ static void free_event(struct perf_event *event)
                        put_callchain_buffers();
                if (is_cgroup_event(event)) {
                        atomic_dec(&per_cpu(perf_cgroup_events, event->cpu));
-                       jump_label_dec_deferred(&perf_sched_events);
+                       static_key_slow_dec_deferred(&perf_sched_events);
+               }
+
+               if (has_branch_stack(event)) {
+                       static_key_slow_dec_deferred(&perf_sched_events);
+                       /* is system-wide event */
+                       if (!(event->attach_state & PERF_ATTACH_TASK))
+                               atomic_dec(&per_cpu(perf_branch_stack_events,
+                                                   event->cpu));
                }
        }
 
@@ -3238,10 +3324,6 @@ int perf_event_task_disable(void)
        return 0;
 }
 
-#ifndef PERF_EVENT_INDEX_OFFSET
-# define PERF_EVENT_INDEX_OFFSET 0
-#endif
-
 static int perf_event_index(struct perf_event *event)
 {
        if (event->hw.state & PERF_HES_STOPPED)
@@ -3250,21 +3332,26 @@ static int perf_event_index(struct perf_event *event)
        if (event->state != PERF_EVENT_STATE_ACTIVE)
                return 0;
 
-       return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET;
+       return event->pmu->event_idx(event);
 }
 
 static void calc_timer_values(struct perf_event *event,
+                               u64 *now,
                                u64 *enabled,
                                u64 *running)
 {
-       u64 now, ctx_time;
+       u64 ctx_time;
 
-       now = perf_clock();
-       ctx_time = event->shadow_ctx_time + now;
+       *now = perf_clock();
+       ctx_time = event->shadow_ctx_time + *now;
        *enabled = ctx_time - event->tstamp_enabled;
        *running = ctx_time - event->tstamp_running;
 }
 
+void __weak perf_update_user_clock(struct perf_event_mmap_page *userpg, u64 now)
+{
+}
+
 /*
  * Callers need to ensure there can be no nesting of this function, otherwise
  * the seqlock logic goes bad. We can not serialize this because the arch
@@ -3274,7 +3361,7 @@ void perf_event_update_userpage(struct perf_event *event)
 {
        struct perf_event_mmap_page *userpg;
        struct ring_buffer *rb;
-       u64 enabled, running;
+       u64 enabled, running, now;
 
        rcu_read_lock();
        /*
@@ -3286,7 +3373,7 @@ void perf_event_update_userpage(struct perf_event *event)
         * because of locking issue as we can be called in
         * NMI context
         */
-       calc_timer_values(event, &enabled, &running);
+       calc_timer_values(event, &now, &enabled, &running);
        rb = rcu_dereference(event->rb);
        if (!rb)
                goto unlock;
@@ -3302,7 +3389,7 @@ void perf_event_update_userpage(struct perf_event *event)
        barrier();
        userpg->index = perf_event_index(event);
        userpg->offset = perf_event_count(event);
-       if (event->state == PERF_EVENT_STATE_ACTIVE)
+       if (userpg->index)
                userpg->offset -= local64_read(&event->hw.prev_count);
 
        userpg->time_enabled = enabled +
@@ -3311,6 +3398,8 @@ void perf_event_update_userpage(struct perf_event *event)
        userpg->time_running = running +
                        atomic64_read(&event->child_total_time_running);
 
+       perf_update_user_clock(userpg, now);
+
        barrier();
        ++userpg->lock;
        preempt_enable();
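
The ++userpg->lock / barrier() pairs above form a seqcount that self-monitoring userspace is expected to mirror when reading the mmap'ed control page, and the new pmu->event_idx() result lands in userpg->index. A rough userspace counterpart, assuming x86 so the counter can be read with rdpmc (error handling and counter-width masking omitted):

	#include <stdint.h>
	#include <linux/perf_event.h>

	static uint64_t rdpmc(uint32_t counter)
	{
		uint32_t low, high;

		asm volatile("rdpmc" : "=a" (low), "=d" (high) : "c" (counter));
		return low | ((uint64_t)high) << 32;
	}

	/* pc points at the first page mmap'ed from the event fd */
	static uint64_t mmap_read_self(volatile struct perf_event_mmap_page *pc)
	{
		uint32_t seq, idx;
		uint64_t count;

		do {
			seq = pc->lock;
			__sync_synchronize();	/* pairs with the kernel's barrier()s */

			idx   = pc->index;	/* 0 means: no user-space counter access */
			count = pc->offset;
			if (idx)
				count += rdpmc(idx - 1);

			__sync_synchronize();
		} while (pc->lock != seq);	/* page was updated underneath us: retry */

		return count;
	}

This is also why the hunk switches the offset adjustment from checking PERF_EVENT_STATE_ACTIVE to checking userpg->index: hw.prev_count is subtracted only when index is non-zero, i.e. only when userspace is expected to add the raw counter value itself. The software PMUs return 0 from their new event_idx callback, so their offset stays the full count and readers fall back to read(2).
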
@@ -3568,6 +3657,8 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
        event->mmap_user = get_current_user();
        vma->vm_mm->pinned_vm += event->mmap_locked;
 
+       perf_event_update_userpage(event);
+
 unlock:
        if (!ret)
                atomic_inc(&event->mmap_count);
@@ -3799,7 +3890,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
 static void perf_output_read(struct perf_output_handle *handle,
                             struct perf_event *event)
 {
-       u64 enabled = 0, running = 0;
+       u64 enabled = 0, running = 0, now;
        u64 read_format = event->attr.read_format;
 
        /*
@@ -3812,7 +3903,7 @@ static void perf_output_read(struct perf_output_handle *handle,
         * NMI context
         */
        if (read_format & PERF_FORMAT_TOTAL_TIMES)
-               calc_timer_values(event, &enabled, &running);
+               calc_timer_values(event, &now, &enabled, &running);
 
        if (event->attr.read_format & PERF_FORMAT_GROUP)
                perf_output_read_group(handle, event, enabled, running);
@@ -3902,6 +3993,24 @@ void perf_output_sample(struct perf_output_handle *handle,
                        }
                }
        }
+
+       if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
+               if (data->br_stack) {
+                       size_t size;
+
+                       size = data->br_stack->nr
+                            * sizeof(struct perf_branch_entry);
+
+                       perf_output_put(handle, data->br_stack->nr);
+                       perf_output_copy(handle, data->br_stack->entries, size);
+               } else {
+                       /*
+                        * we always store at least the value of nr
+                        */
+                       u64 nr = 0;
+                       perf_output_put(handle, nr);
+               }
+       }
 }
 
 void perf_prepare_sample(struct perf_event_header *header,
@@ -3944,6 +4053,15 @@ void perf_prepare_sample(struct perf_event_header *header,
                WARN_ON_ONCE(size & (sizeof(u64)-1));
                header->size += size;
        }
+
+       if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
+               int size = sizeof(u64); /* nr */
+               if (data->br_stack) {
+                       size += data->br_stack->nr
+                             * sizeof(struct perf_branch_entry);
+               }
+               header->size += size;
+       }
 }
 
 static void perf_event_output(struct perf_event *event,
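
Taken together, these two hunks fix the on-the-wire layout of the new sample field: a u64 count of entries (always emitted, possibly 0) followed by that many struct perf_branch_entry records, the struct coming from the companion perf_event.h patch. A sketch of how a ring-buffer consumer might walk that portion of a sample record (p is assumed to point just past the previously parsed sample fields):

	#include <stdio.h>
	#include <stdint.h>
	#include <string.h>
	#include <linux/perf_event.h>

	static const unsigned char *parse_branch_stack(const unsigned char *p)
	{
		const struct perf_branch_entry *ent;
		uint64_t nr, i;

		memcpy(&nr, p, sizeof(nr));	/* the 'nr' word is always present */
		p += sizeof(nr);

		ent = (const struct perf_branch_entry *)p;
		for (i = 0; i < nr; i++)
			printf("branch %llu: %#llx -> %#llx\n",
			       (unsigned long long)i,
			       (unsigned long long)ent[i].from,
			       (unsigned long long)ent[i].to);

		return p + nr * sizeof(*ent);
	}
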
@@ -4986,7 +5104,7 @@ fail:
        return err;
 }
 
-struct jump_label_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
+struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
 
 static void sw_perf_event_destroy(struct perf_event *event)
 {
@@ -4994,7 +5112,7 @@ static void sw_perf_event_destroy(struct perf_event *event)
 
        WARN_ON(event->parent);
 
-       jump_label_dec(&perf_swevent_enabled[event_id]);
+       static_key_slow_dec(&perf_swevent_enabled[event_id]);
        swevent_hlist_put(event);
 }
 
@@ -5005,6 +5123,12 @@ static int perf_swevent_init(struct perf_event *event)
        if (event->attr.type != PERF_TYPE_SOFTWARE)
                return -ENOENT;
 
+       /*
+        * no branch sampling for software events
+        */
+       if (has_branch_stack(event))
+               return -EOPNOTSUPP;
+
        switch (event_id) {
        case PERF_COUNT_SW_CPU_CLOCK:
        case PERF_COUNT_SW_TASK_CLOCK:
@@ -5024,13 +5148,18 @@ static int perf_swevent_init(struct perf_event *event)
                if (err)
                        return err;
 
-               jump_label_inc(&perf_swevent_enabled[event_id]);
+               static_key_slow_inc(&perf_swevent_enabled[event_id]);
                event->destroy = sw_perf_event_destroy;
        }
 
        return 0;
 }
 
+static int perf_swevent_event_idx(struct perf_event *event)
+{
+       return 0;
+}
+
 static struct pmu perf_swevent = {
        .task_ctx_nr    = perf_sw_context,
 
@@ -5040,6 +5169,8 @@ static struct pmu perf_swevent = {
        .start          = perf_swevent_start,
        .stop           = perf_swevent_stop,
        .read           = perf_swevent_read,
+
+       .event_idx      = perf_swevent_event_idx,
 };
 
 #ifdef CONFIG_EVENT_TRACING
@@ -5108,6 +5239,12 @@ static int perf_tp_event_init(struct perf_event *event)
        if (event->attr.type != PERF_TYPE_TRACEPOINT)
                return -ENOENT;
 
+       /*
+        * no branch sampling for tracepoint events
+        */
+       if (has_branch_stack(event))
+               return -EOPNOTSUPP;
+
        err = perf_trace_init(event);
        if (err)
                return err;
@@ -5126,6 +5263,8 @@ static struct pmu perf_tracepoint = {
        .start          = perf_swevent_start,
        .stop           = perf_swevent_stop,
        .read           = perf_swevent_read,
+
+       .event_idx      = perf_swevent_event_idx,
 };
 
 static inline void perf_tp_register(void)
@@ -5331,6 +5470,12 @@ static int cpu_clock_event_init(struct perf_event *event)
        if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
                return -ENOENT;
 
+       /*
+        * no branch sampling for software events
+        */
+       if (has_branch_stack(event))
+               return -EOPNOTSUPP;
+
        perf_swevent_init_hrtimer(event);
 
        return 0;
@@ -5345,6 +5490,8 @@ static struct pmu perf_cpu_clock = {
        .start          = cpu_clock_event_start,
        .stop           = cpu_clock_event_stop,
        .read           = cpu_clock_event_read,
+
+       .event_idx      = perf_swevent_event_idx,
 };
 
 /*
@@ -5403,6 +5550,12 @@ static int task_clock_event_init(struct perf_event *event)
        if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
                return -ENOENT;
 
+       /*
+        * no branch sampling for software events
+        */
+       if (has_branch_stack(event))
+               return -EOPNOTSUPP;
+
        perf_swevent_init_hrtimer(event);
 
        return 0;
@@ -5417,6 +5570,8 @@ static struct pmu perf_task_clock = {
        .start          = task_clock_event_start,
        .stop           = task_clock_event_stop,
        .read           = task_clock_event_read,
+
+       .event_idx      = perf_swevent_event_idx,
 };
 
 static void perf_pmu_nop_void(struct pmu *pmu)
@@ -5444,6 +5599,11 @@ static void perf_pmu_cancel_txn(struct pmu *pmu)
        perf_pmu_enable(pmu);
 }
 
+static int perf_event_idx_default(struct perf_event *event)
+{
+       return event->hw.idx + 1;
+}
+
 /*
  * Ensures all contexts with the same task_ctx_nr have the same
  * pmu_cpu_context too.
@@ -5530,6 +5690,7 @@ static int pmu_dev_alloc(struct pmu *pmu)
        if (!pmu->dev)
                goto out;
 
+       pmu->dev->groups = pmu->attr_groups;
        device_initialize(pmu->dev);
        ret = dev_set_name(pmu->dev, "%s", pmu->name);
        if (ret)
@@ -5633,6 +5794,9 @@ got_cpu_context:
                pmu->pmu_disable = perf_pmu_nop_void;
        }
 
+       if (!pmu->event_idx)
+               pmu->event_idx = perf_event_idx_default;
+
        list_add_rcu(&pmu->entry, &pmus);
        ret = 0;
 unlock:
@@ -5825,7 +5989,7 @@ done:
 
        if (!event->parent) {
                if (event->attach_state & PERF_ATTACH_TASK)
-                       jump_label_inc(&perf_sched_events.key);
+                       static_key_slow_inc(&perf_sched_events.key);
                if (event->attr.mmap || event->attr.mmap_data)
                        atomic_inc(&nr_mmap_events);
                if (event->attr.comm)
@@ -5839,6 +6003,12 @@ done:
                                return ERR_PTR(err);
                        }
                }
+               if (has_branch_stack(event)) {
+                       static_key_slow_inc(&perf_sched_events.key);
+                       if (!(event->attach_state & PERF_ATTACH_TASK))
+                               atomic_inc(&per_cpu(perf_branch_stack_events,
+                                                   event->cpu));
+               }
        }
 
        return event;
@@ -5908,6 +6078,40 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
        if (attr->read_format & ~(PERF_FORMAT_MAX-1))
                return -EINVAL;
 
+       if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) {
+               u64 mask = attr->branch_sample_type;
+
+               /* only using defined bits */
+               if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1))
+                       return -EINVAL;
+
+               /* at least one branch bit must be set */
+               if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL))
+                       return -EINVAL;
+
+               /* kernel level capture: check permissions */
+               if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
+                   && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
+                       return -EACCES;
+
+               /* propagate priv level, when not set for branch */
+               if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) {
+
+                       /* exclude_kernel checked on syscall entry */
+                       if (!attr->exclude_kernel)
+                               mask |= PERF_SAMPLE_BRANCH_KERNEL;
+
+                       if (!attr->exclude_user)
+                               mask |= PERF_SAMPLE_BRANCH_USER;
+
+                       if (!attr->exclude_hv)
+                               mask |= PERF_SAMPLE_BRANCH_HV;
+                       /*
+                        * adjust user setting (for HW filter setup)
+                        */
+                       attr->branch_sample_type = mask;
+               }
+       }
 out:
        return ret;
 
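
From userspace, the validation above means branch sampling is requested by setting PERF_SAMPLE_BRANCH_STACK in sample_type plus at least one branch-type bit in the new branch_sample_type field; if no priv-level bit is supplied, the kernel derives one from the existing exclude_* flags before the PMU sees the attribute. A hedged example (PERF_SAMPLE_BRANCH_ANY and the branch_sample_type field come from the companion uapi patch; error handling elided):

	#include <string.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/perf_event.h>

	int open_branch_sampling_event(void)
	{
		struct perf_event_attr attr;

		memset(&attr, 0, sizeof(attr));
		attr.size		= sizeof(attr);
		attr.type		= PERF_TYPE_HARDWARE;
		attr.config		= PERF_COUNT_HW_CPU_CYCLES;
		attr.sample_period	= 100000;
		attr.sample_type	= PERF_SAMPLE_IP | PERF_SAMPLE_BRANCH_STACK;
		attr.exclude_kernel	= 1;	/* with exclude_hv below: user level only */
		attr.exclude_hv		= 1;
		/* a branch-type bit but no USER/KERNEL/HV bit: the kernel fills those in */
		attr.branch_sample_type	= PERF_SAMPLE_BRANCH_ANY;

		/* measure the calling thread on any CPU */
		return syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	}

The same attribute with type set to PERF_TYPE_SOFTWARE (or a tracepoint/clock event) would now be rejected with EOPNOTSUPP by the has_branch_stack() checks added to the software PMUs earlier in this diff.
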
@@ -6063,7 +6267,7 @@ SYSCALL_DEFINE5(perf_event_open,
                 * - that may need work on context switch
                 */
                atomic_inc(&per_cpu(perf_cgroup_events, event->cpu));
-               jump_label_inc(&perf_sched_events.key);
+               static_key_slow_inc(&perf_sched_events.key);
        }
 
        /*
@@ -6943,8 +7147,7 @@ unlock:
 device_initcall(perf_event_sysfs_init);
 
 #ifdef CONFIG_CGROUP_PERF
-static struct cgroup_subsys_state *perf_cgroup_create(
-       struct cgroup_subsys *ss, struct cgroup *cont)
+static struct cgroup_subsys_state *perf_cgroup_create(struct cgroup *cont)
 {
        struct perf_cgroup *jc;
 
@@ -6961,8 +7164,7 @@ static struct cgroup_subsys_state *perf_cgroup_create(
        return &jc->css;
 }
 
-static void perf_cgroup_destroy(struct cgroup_subsys *ss,
-                               struct cgroup *cont)
+static void perf_cgroup_destroy(struct cgroup *cont)
 {
        struct perf_cgroup *jc;
        jc = container_of(cgroup_subsys_state(cont, perf_subsys_id),
@@ -6978,8 +7180,7 @@ static int __perf_cgroup_move(void *info)
        return 0;
 }
 
-static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
-                              struct cgroup_taskset *tset)
+static void perf_cgroup_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
 {
        struct task_struct *task;
 
@@ -6987,8 +7188,8 @@ static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
                task_function_call(task, __perf_cgroup_move, task);
 }
 
-static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
-               struct cgroup *old_cgrp, struct task_struct *task)
+static void perf_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp,
+                            struct task_struct *task)
 {
        /*
         * cgroup_exit() is called in the copy_process() failure path.