Merge branch 'for-3.4' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 1b5c081d8b9f9c8ea05f1a8ecaf861f80ab7ba1c..4b50357914fb437a30cd146e1bd33e1f2b43c449 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -118,6 +118,13 @@ static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
                       PERF_FLAG_FD_OUTPUT  |\
                       PERF_FLAG_PID_CGROUP)
 
+/*
+ * branch priv levels that need permission checks
+ */
+#define PERF_SAMPLE_BRANCH_PERM_PLM \
+       (PERF_SAMPLE_BRANCH_KERNEL |\
+        PERF_SAMPLE_BRANCH_HV)
+
 enum event_type_t {
        EVENT_FLEXIBLE = 0x1,
        EVENT_PINNED = 0x2,
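
Note on the new mask: PERF_SAMPLE_BRANCH_PERM_PLM intentionally covers only the kernel and hypervisor levels, since user-level capture needs no privilege check; perf_copy_attr() below tests it together with perf_paranoid_kernel(). For orientation, the companion perf_event.h patch defines the full priv-level mask roughly as follows (quoted as a sketch for context, not part of this hunk):

	#define PERF_SAMPLE_BRANCH_PLM_ALL \
		(PERF_SAMPLE_BRANCH_USER |\
		 PERF_SAMPLE_BRANCH_KERNEL |\
		 PERF_SAMPLE_BRANCH_HV)
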
@@ -128,8 +135,9 @@ enum event_type_t {
  * perf_sched_events : >0 events exist
  * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
  */
-struct jump_label_key_deferred perf_sched_events __read_mostly;
+struct static_key_deferred perf_sched_events __read_mostly;
 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
+static DEFINE_PER_CPU(atomic_t, perf_branch_stack_events);
 
 static atomic_t nr_mmap_events __read_mostly;
 static atomic_t nr_comm_events __read_mostly;
@@ -881,6 +889,9 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
        if (is_cgroup_event(event))
                ctx->nr_cgroups++;
 
+       if (has_branch_stack(event))
+               ctx->nr_branch_stack++;
+
        list_add_rcu(&event->event_entry, &ctx->event_list);
        if (!ctx->nr_events)
                perf_pmu_rotate_start(ctx->pmu);
@@ -1020,6 +1031,9 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
                        cpuctx->cgrp = NULL;
        }
 
+       if (has_branch_stack(event))
+               ctx->nr_branch_stack--;
+
        ctx->nr_events--;
        if (event->attr.inherit_stat)
                ctx->nr_stat--;
@@ -2194,6 +2208,66 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
        perf_pmu_rotate_start(ctx->pmu);
 }
 
+/*
+ * When sampling the branch stack in system-wide mode, it may be necessary
+ * to flush the stack on context switch. This happens when the branch
+ * stack does not tag its entries with the pid of the current task.
+ * Otherwise it becomes impossible to associate a branch entry with a
+ * task. This ambiguity is more likely to appear when the branch stack
+ * supports priv level filtering and the user sets it to monitor only
+ * at the user level (which could be a useful measurement in system-wide
+ * mode). In that case, the risk is high of having a branch stack with
+ * branches from multiple tasks. Flushing may mean dropping the existing
+ * entries or stashing them somewhere in the PMU specific code layer.
+ *
+ * This function provides the context switch callback to the lower code
+ * layer. It is invoked ONLY when there is at least one system-wide context
+ * with at least one active event using taken branch sampling.
+ */
+static void perf_branch_stack_sched_in(struct task_struct *prev,
+                                      struct task_struct *task)
+{
+       struct perf_cpu_context *cpuctx;
+       struct pmu *pmu;
+       unsigned long flags;
+
+       /* no need to flush branch stack if not changing task */
+       if (prev == task)
+               return;
+
+       local_irq_save(flags);
+
+       rcu_read_lock();
+
+       list_for_each_entry_rcu(pmu, &pmus, entry) {
+               cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+
+               /*
+                * check if the context has at least one
+                * event using PERF_SAMPLE_BRANCH_STACK
+                */
+               if (cpuctx->ctx.nr_branch_stack > 0
+                   && pmu->flush_branch_stack) {
+
+                       pmu = cpuctx->ctx.pmu;
+
+                       perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+
+                       perf_pmu_disable(pmu);
+
+                       pmu->flush_branch_stack();
+
+                       perf_pmu_enable(pmu);
+
+                       perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
+               }
+       }
+
+       rcu_read_unlock();
+
+       local_irq_restore(flags);
+}
+
 /*
  * Called from scheduler to add the events of the current task
  * with interrupts disabled.
@@ -2225,6 +2299,10 @@ void __perf_event_task_sched_in(struct task_struct *prev,
         */
        if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
                perf_cgroup_sched_in(prev, task);
+
+       /* check for system-wide branch_stack events */
+       if (atomic_read(&__get_cpu_var(perf_branch_stack_events)))
+               perf_branch_stack_sched_in(prev, task);
 }
 
 static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
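
The two hunks above only decide when to call pmu->flush_branch_stack(); what "flushing" means is left to the PMU driver, which may drop the hardware entries or stash them in its own per-CPU state. A minimal sketch of a driver wiring up the callback (the my_pmu_* names are hypothetical, not existing kernel symbols):

	#include <linux/perf_event.h>

	/* invoked on context switch when a system-wide branch-stack event is active */
	static void my_pmu_flush_branch_stack(void)
	{
		/* hypothetical helper: reset or save the LBR-style branch buffer */
		my_pmu_reset_branch_buffer();
	}

	static struct pmu my_pmu = {
		.task_ctx_nr		= perf_hw_context,
		/* ... the usual event_init/add/del/start/stop/read callbacks ... */
		.flush_branch_stack	= my_pmu_flush_branch_stack,
	};
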
@@ -2778,7 +2856,7 @@ static void free_event(struct perf_event *event)
 
        if (!event->parent) {
                if (event->attach_state & PERF_ATTACH_TASK)
-                       jump_label_dec_deferred(&perf_sched_events);
+                       static_key_slow_dec_deferred(&perf_sched_events);
                if (event->attr.mmap || event->attr.mmap_data)
                        atomic_dec(&nr_mmap_events);
                if (event->attr.comm)
@@ -2789,7 +2867,15 @@ static void free_event(struct perf_event *event)
                        put_callchain_buffers();
                if (is_cgroup_event(event)) {
                        atomic_dec(&per_cpu(perf_cgroup_events, event->cpu));
-                       jump_label_dec_deferred(&perf_sched_events);
+                       static_key_slow_dec_deferred(&perf_sched_events);
+               }
+
+               if (has_branch_stack(event)) {
+                       static_key_slow_dec_deferred(&perf_sched_events);
+                       /* is system-wide event */
+                       if (!(event->attach_state & PERF_ATTACH_TASK))
+                               atomic_dec(&per_cpu(perf_branch_stack_events,
+                                                   event->cpu));
                }
        }
 
@@ -3238,10 +3324,6 @@ int perf_event_task_disable(void)
        return 0;
 }
 
-#ifndef PERF_EVENT_INDEX_OFFSET
-# define PERF_EVENT_INDEX_OFFSET 0
-#endif
-
 static int perf_event_index(struct perf_event *event)
 {
        if (event->hw.state & PERF_HES_STOPPED)
@@ -3250,21 +3332,26 @@ static int perf_event_index(struct perf_event *event)
        if (event->state != PERF_EVENT_STATE_ACTIVE)
                return 0;
 
-       return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET;
+       return event->pmu->event_idx(event);
 }
 
 static void calc_timer_values(struct perf_event *event,
+                               u64 *now,
                                u64 *enabled,
                                u64 *running)
 {
-       u64 now, ctx_time;
+       u64 ctx_time;
 
-       now = perf_clock();
-       ctx_time = event->shadow_ctx_time + now;
+       *now = perf_clock();
+       ctx_time = event->shadow_ctx_time + *now;
        *enabled = ctx_time - event->tstamp_enabled;
        *running = ctx_time - event->tstamp_running;
 }
 
+void __weak perf_update_user_clock(struct perf_event_mmap_page *userpg, u64 now)
+{
+}
+
 /*
  * Callers need to ensure there can be no nesting of this function, otherwise
  * the seqlock logic goes bad. We can not serialize this because the arch
@@ -3274,7 +3361,7 @@ void perf_event_update_userpage(struct perf_event *event)
 {
        struct perf_event_mmap_page *userpg;
        struct ring_buffer *rb;
-       u64 enabled, running;
+       u64 enabled, running, now;
 
        rcu_read_lock();
        /*
@@ -3286,7 +3373,7 @@ void perf_event_update_userpage(struct perf_event *event)
         * because of locking issue as we can be called in
         * NMI context
         */
-       calc_timer_values(event, &enabled, &running);
+       calc_timer_values(event, &now, &enabled, &running);
        rb = rcu_dereference(event->rb);
        if (!rb)
                goto unlock;
@@ -3302,7 +3389,7 @@ void perf_event_update_userpage(struct perf_event *event)
        barrier();
        userpg->index = perf_event_index(event);
        userpg->offset = perf_event_count(event);
-       if (event->state == PERF_EVENT_STATE_ACTIVE)
+       if (userpg->index)
                userpg->offset -= local64_read(&event->hw.prev_count);
 
        userpg->time_enabled = enabled +
@@ -3311,6 +3398,8 @@ void perf_event_update_userpage(struct perf_event *event)
        userpg->time_running = running +
                        atomic64_read(&event->child_total_time_running);
 
+       perf_update_user_clock(userpg, now);
+
        barrier();
        ++userpg->lock;
        preempt_enable();
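
The ++userpg->lock / barrier() pairs above form a seqcount that self-monitoring userspace is expected to mirror when reading the mmap'ed control page, and the new pmu->event_idx() result lands in userpg->index. A rough userspace counterpart, assuming x86 so the counter can be read with rdpmc (error handling and counter-width masking omitted):

	#include <stdint.h>
	#include <linux/perf_event.h>

	static uint64_t rdpmc(uint32_t counter)
	{
		uint32_t low, high;

		asm volatile("rdpmc" : "=a" (low), "=d" (high) : "c" (counter));
		return low | ((uint64_t)high) << 32;
	}

	/* pc points at the first page mmap'ed from the event fd */
	static uint64_t mmap_read_self(volatile struct perf_event_mmap_page *pc)
	{
		uint32_t seq, idx;
		uint64_t count;

		do {
			seq = pc->lock;
			__sync_synchronize();	/* pairs with the kernel's barrier()s */

			idx   = pc->index;	/* 0 means: no user-space counter access */
			count = pc->offset;
			if (idx)
				count += rdpmc(idx - 1);

			__sync_synchronize();
		} while (pc->lock != seq);	/* page was updated underneath us: retry */

		return count;
	}

This is also why the hunk switches the offset adjustment from checking PERF_EVENT_STATE_ACTIVE to checking userpg->index: hw.prev_count is subtracted only when index is non-zero, i.e. only when userspace is expected to add the raw counter value itself. The software PMUs return 0 from their new event_idx callback, so their offset stays the full count and readers fall back to read(2).
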
@@ -3568,6 +3657,8 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
        event->mmap_user = get_current_user();
        vma->vm_mm->pinned_vm += event->mmap_locked;
 
+       perf_event_update_userpage(event);
+
 unlock:
        if (!ret)
                atomic_inc(&event->mmap_count);
@@ -3799,7 +3890,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
 static void perf_output_read(struct perf_output_handle *handle,
                             struct perf_event *event)
 {
-       u64 enabled = 0, running = 0;
+       u64 enabled = 0, running = 0, now;
        u64 read_format = event->attr.read_format;
 
        /*
@@ -3812,7 +3903,7 @@ static void perf_output_read(struct perf_output_handle *handle,
         * NMI context
         */
        if (read_format & PERF_FORMAT_TOTAL_TIMES)
-               calc_timer_values(event, &enabled, &running);
+               calc_timer_values(event, &now, &enabled, &running);
 
        if (event->attr.read_format & PERF_FORMAT_GROUP)
                perf_output_read_group(handle, event, enabled, running);
@@ -3902,6 +3993,24 @@ void perf_output_sample(struct perf_output_handle *handle,
                        }
                }
        }
+
+       if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
+               if (data->br_stack) {
+                       size_t size;
+
+                       size = data->br_stack->nr
+                            * sizeof(struct perf_branch_entry);
+
+                       perf_output_put(handle, data->br_stack->nr);
+                       perf_output_copy(handle, data->br_stack->entries, size);
+               } else {
+                       /*
+                        * we always store at least the value of nr
+                        */
+                       u64 nr = 0;
+                       perf_output_put(handle, nr);
+               }
+       }
 }
 
 void perf_prepare_sample(struct perf_event_header *header,
@@ -3944,6 +4053,15 @@ void perf_prepare_sample(struct perf_event_header *header,
                WARN_ON_ONCE(size & (sizeof(u64)-1));
                header->size += size;
        }
+
+       if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
+               int size = sizeof(u64); /* nr */
+               if (data->br_stack) {
+                       size += data->br_stack->nr
+                             * sizeof(struct perf_branch_entry);
+               }
+               header->size += size;
+       }
 }
 
 static void perf_event_output(struct perf_event *event,
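
Taken together, these two hunks fix the on-the-wire layout of the new sample field: a u64 count of entries (always emitted, possibly 0) followed by that many struct perf_branch_entry records, the struct coming from the companion perf_event.h patch. A sketch of how a ring-buffer consumer might walk that portion of a sample record (p is assumed to point just past the previously parsed sample fields):

	#include <stdio.h>
	#include <stdint.h>
	#include <string.h>
	#include <linux/perf_event.h>

	static const unsigned char *parse_branch_stack(const unsigned char *p)
	{
		const struct perf_branch_entry *ent;
		uint64_t nr, i;

		memcpy(&nr, p, sizeof(nr));	/* the 'nr' word is always present */
		p += sizeof(nr);

		ent = (const struct perf_branch_entry *)p;
		for (i = 0; i < nr; i++)
			printf("branch %llu: %#llx -> %#llx\n",
			       (unsigned long long)i,
			       (unsigned long long)ent[i].from,
			       (unsigned long long)ent[i].to);

		return p + nr * sizeof(*ent);
	}
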
@@ -4986,7 +5104,7 @@ fail:
        return err;
 }
 
-struct jump_label_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
+struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
 
 static void sw_perf_event_destroy(struct perf_event *event)
 {
@@ -4994,7 +5112,7 @@ static void sw_perf_event_destroy(struct perf_event *event)
 
        WARN_ON(event->parent);
 
-       jump_label_dec(&perf_swevent_enabled[event_id]);
+       static_key_slow_dec(&perf_swevent_enabled[event_id]);
        swevent_hlist_put(event);
 }
 
@@ -5005,6 +5123,12 @@ static int perf_swevent_init(struct perf_event *event)
        if (event->attr.type != PERF_TYPE_SOFTWARE)
                return -ENOENT;
 
+       /*
+        * no branch sampling for software events
+        */
+       if (has_branch_stack(event))
+               return -EOPNOTSUPP;
+
        switch (event_id) {
        case PERF_COUNT_SW_CPU_CLOCK:
        case PERF_COUNT_SW_TASK_CLOCK:
@@ -5024,13 +5148,18 @@ static int perf_swevent_init(struct perf_event *event)
                if (err)
                        return err;
 
-               jump_label_inc(&perf_swevent_enabled[event_id]);
+               static_key_slow_inc(&perf_swevent_enabled[event_id]);
                event->destroy = sw_perf_event_destroy;
        }
 
        return 0;
 }
 
+static int perf_swevent_event_idx(struct perf_event *event)
+{
+       return 0;
+}
+
 static struct pmu perf_swevent = {
        .task_ctx_nr    = perf_sw_context,
 
@@ -5040,6 +5169,8 @@ static struct pmu perf_swevent = {
        .start          = perf_swevent_start,
        .stop           = perf_swevent_stop,
        .read           = perf_swevent_read,
+
+       .event_idx      = perf_swevent_event_idx,
 };
 
 #ifdef CONFIG_EVENT_TRACING
@@ -5108,6 +5239,12 @@ static int perf_tp_event_init(struct perf_event *event)
        if (event->attr.type != PERF_TYPE_TRACEPOINT)
                return -ENOENT;
 
+       /*
+        * no branch sampling for tracepoint events
+        */
+       if (has_branch_stack(event))
+               return -EOPNOTSUPP;
+
        err = perf_trace_init(event);
        if (err)
                return err;
@@ -5126,6 +5263,8 @@ static struct pmu perf_tracepoint = {
        .start          = perf_swevent_start,
        .stop           = perf_swevent_stop,
        .read           = perf_swevent_read,
+
+       .event_idx      = perf_swevent_event_idx,
 };
 
 static inline void perf_tp_register(void)
@@ -5331,6 +5470,12 @@ static int cpu_clock_event_init(struct perf_event *event)
        if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
                return -ENOENT;
 
+       /*
+        * no branch sampling for software events
+        */
+       if (has_branch_stack(event))
+               return -EOPNOTSUPP;
+
        perf_swevent_init_hrtimer(event);
 
        return 0;
@@ -5345,6 +5490,8 @@ static struct pmu perf_cpu_clock = {
        .start          = cpu_clock_event_start,
        .stop           = cpu_clock_event_stop,
        .read           = cpu_clock_event_read,
+
+       .event_idx      = perf_swevent_event_idx,
 };
 
 /*
@@ -5403,6 +5550,12 @@ static int task_clock_event_init(struct perf_event *event)
        if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
                return -ENOENT;
 
+       /*
+        * no branch sampling for software events
+        */
+       if (has_branch_stack(event))
+               return -EOPNOTSUPP;
+
        perf_swevent_init_hrtimer(event);
 
        return 0;
@@ -5417,6 +5570,8 @@ static struct pmu perf_task_clock = {
        .start          = task_clock_event_start,
        .stop           = task_clock_event_stop,
        .read           = task_clock_event_read,
+
+       .event_idx      = perf_swevent_event_idx,
 };
 
 static void perf_pmu_nop_void(struct pmu *pmu)
@@ -5444,6 +5599,11 @@ static void perf_pmu_cancel_txn(struct pmu *pmu)
        perf_pmu_enable(pmu);
 }
 
+static int perf_event_idx_default(struct perf_event *event)
+{
+       return event->hw.idx + 1;
+}
+
 /*
  * Ensures all contexts with the same task_ctx_nr have the same
  * pmu_cpu_context too.
@@ -5530,6 +5690,7 @@ static int pmu_dev_alloc(struct pmu *pmu)
        if (!pmu->dev)
                goto out;
 
+       pmu->dev->groups = pmu->attr_groups;
        device_initialize(pmu->dev);
        ret = dev_set_name(pmu->dev, "%s", pmu->name);
        if (ret)
@@ -5633,6 +5794,9 @@ got_cpu_context:
                pmu->pmu_disable = perf_pmu_nop_void;
        }
 
+       if (!pmu->event_idx)
+               pmu->event_idx = perf_event_idx_default;
+
        list_add_rcu(&pmu->entry, &pmus);
        ret = 0;
 unlock:
@@ -5825,7 +5989,7 @@ done:
 
        if (!event->parent) {
                if (event->attach_state & PERF_ATTACH_TASK)
-                       jump_label_inc(&perf_sched_events.key);
+                       static_key_slow_inc(&perf_sched_events.key);
                if (event->attr.mmap || event->attr.mmap_data)
                        atomic_inc(&nr_mmap_events);
                if (event->attr.comm)
@@ -5839,6 +6003,12 @@ done:
                                return ERR_PTR(err);
                        }
                }
+               if (has_branch_stack(event)) {
+                       static_key_slow_inc(&perf_sched_events.key);
+                       if (!(event->attach_state & PERF_ATTACH_TASK))
+                               atomic_inc(&per_cpu(perf_branch_stack_events,
+                                                   event->cpu));
+               }
        }
 
        return event;
@@ -5908,6 +6078,40 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
        if (attr->read_format & ~(PERF_FORMAT_MAX-1))
                return -EINVAL;
 
+       if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) {
+               u64 mask = attr->branch_sample_type;
+
+               /* only using defined bits */
+               if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1))
+                       return -EINVAL;
+
+               /* at least one branch bit must be set */
+               if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL))
+                       return -EINVAL;
+
+               /* kernel level capture: check permissions */
+               if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
+                   && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
+                       return -EACCES;
+
+               /* propagate priv level, when not set for branch */
+               if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) {
+
+                       /* exclude_kernel checked on syscall entry */
+                       if (!attr->exclude_kernel)
+                               mask |= PERF_SAMPLE_BRANCH_KERNEL;
+
+                       if (!attr->exclude_user)
+                               mask |= PERF_SAMPLE_BRANCH_USER;
+
+                       if (!attr->exclude_hv)
+                               mask |= PERF_SAMPLE_BRANCH_HV;
+                       /*
+                        * adjust user setting (for HW filter setup)
+                        */
+                       attr->branch_sample_type = mask;
+               }
+       }
 out:
        return ret;
 
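
From userspace, the validation above means branch sampling is requested by setting PERF_SAMPLE_BRANCH_STACK in sample_type plus at least one branch-type bit in the new branch_sample_type field; if no priv-level bit is supplied, the kernel derives one from the existing exclude_* flags before the PMU sees the attribute. A hedged example (PERF_SAMPLE_BRANCH_ANY and the branch_sample_type field come from the companion uapi patch; error handling elided):

	#include <string.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/perf_event.h>

	int open_branch_sampling_event(void)
	{
		struct perf_event_attr attr;

		memset(&attr, 0, sizeof(attr));
		attr.size		= sizeof(attr);
		attr.type		= PERF_TYPE_HARDWARE;
		attr.config		= PERF_COUNT_HW_CPU_CYCLES;
		attr.sample_period	= 100000;
		attr.sample_type	= PERF_SAMPLE_IP | PERF_SAMPLE_BRANCH_STACK;
		attr.exclude_kernel	= 1;	/* with exclude_hv below: user level only */
		attr.exclude_hv		= 1;
		/* a branch-type bit but no USER/KERNEL/HV bit: the kernel fills those in */
		attr.branch_sample_type	= PERF_SAMPLE_BRANCH_ANY;

		/* measure the calling thread on any CPU */
		return syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	}

The same attribute with type set to PERF_TYPE_SOFTWARE (or a tracepoint/clock event) would now be rejected with EOPNOTSUPP by the has_branch_stack() checks added to the software PMUs earlier in this diff.
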
@@ -6063,7 +6267,7 @@ SYSCALL_DEFINE5(perf_event_open,
                 * - that may need work on context switch
                 */
                atomic_inc(&per_cpu(perf_cgroup_events, event->cpu));
-               jump_label_inc(&perf_sched_events.key);
+               static_key_slow_inc(&perf_sched_events.key);
        }
 
        /*
@@ -6943,8 +7147,7 @@ unlock:
 device_initcall(perf_event_sysfs_init);
 
 #ifdef CONFIG_CGROUP_PERF
-static struct cgroup_subsys_state *perf_cgroup_create(
-       struct cgroup_subsys *ss, struct cgroup *cont)
+static struct cgroup_subsys_state *perf_cgroup_create(struct cgroup *cont)
 {
        struct perf_cgroup *jc;
 
@@ -6961,8 +7164,7 @@ static struct cgroup_subsys_state *perf_cgroup_create(
        return &jc->css;
 }
 
-static void perf_cgroup_destroy(struct cgroup_subsys *ss,
-                               struct cgroup *cont)
+static void perf_cgroup_destroy(struct cgroup *cont)
 {
        struct perf_cgroup *jc;
        jc = container_of(cgroup_subsys_state(cont, perf_subsys_id),
@@ -6978,8 +7180,7 @@ static int __perf_cgroup_move(void *info)
        return 0;
 }
 
-static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
-                              struct cgroup_taskset *tset)
+static void perf_cgroup_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
 {
        struct task_struct *task;
 
@@ -6987,8 +7188,8 @@ static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
                task_function_call(task, __perf_cgroup_move, task);
 }
 
-static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
-               struct cgroup *old_cgrp, struct task_struct *task)
+static void perf_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp,
+                            struct task_struct *task)
 {
        /*
         * cgroup_exit() is called in the copy_process() failure path.