#include <linux/anon_inodes.h>
#include <linux/kernel_stat.h>
#include <linux/perf_counter.h>
+#include <linux/mm.h>
+#include <linux/vmstat.h>
/*
* Each CPU has a list of per CPU counters:
extern __weak const struct hw_perf_counter_ops *
hw_perf_counter_init(struct perf_counter *counter)
{
- return ERR_PTR(-EINVAL);
+ return NULL;
}
u64 __weak hw_perf_save_disable(void) { return 0; }
-void __weak hw_perf_restore(u64 ctrl) { }
-void __weak hw_perf_counter_setup(void) { }
+void __weak hw_perf_restore(u64 ctrl) { barrier(); }
+void __weak hw_perf_counter_setup(int cpu) { barrier(); }
+int __weak hw_perf_group_sched_in(struct perf_counter *group_leader,
+ struct perf_cpu_context *cpuctx,
+ struct perf_counter_context *ctx, int cpu)
+{
+ return 0;
+}
+
+void __weak perf_counter_print_debug(void) { }
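These __weak definitions are the generic fall-backs; architecture PMU code overrides them. The new hw_perf_group_sched_in() hook lets an architecture schedule an entire counter group onto the PMU atomically, and its return-value convention can be read off group_sched_in() further down: negative means error, positive means the architecture handled the whole group, zero means fall back to the generic per-counter path. A minimal sketch of an override, assuming a hypothetical arch_try_schedule_group() helper:

    /* Sketch only, not part of this patch; arch_try_schedule_group()
     * is a hypothetical per-architecture helper. */
    int hw_perf_group_sched_in(struct perf_counter *group_leader,
                               struct perf_cpu_context *cpuctx,
                               struct perf_counter_context *ctx, int cpu)
    {
            if (!arch_try_schedule_group(group_leader, cpu))
                    return 0;       /* let the generic per-counter path try */
            return 1;               /* whole group is now counting */
    }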
static void
list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
}
}
+static void
+counter_sched_out(struct perf_counter *counter,
+ struct perf_cpu_context *cpuctx,
+ struct perf_counter_context *ctx)
+{
+ if (counter->state != PERF_COUNTER_STATE_ACTIVE)
+ return;
+
+ counter->state = PERF_COUNTER_STATE_INACTIVE;
+ counter->hw_ops->disable(counter);
+ counter->oncpu = -1;
+
+ if (!is_software_counter(counter))
+ cpuctx->active_oncpu--;
+ ctx->nr_active--;
+ if (counter->hw_event.exclusive || !cpuctx->active_oncpu)
+ cpuctx->exclusive = 0;
+}
+
+static void
+group_sched_out(struct perf_counter *group_counter,
+ struct perf_cpu_context *cpuctx,
+ struct perf_counter_context *ctx)
+{
+ struct perf_counter *counter;
+
+ if (group_counter->state != PERF_COUNTER_STATE_ACTIVE)
+ return;
+
+ counter_sched_out(group_counter, cpuctx, ctx);
+
+ /*
+ * Schedule out siblings (if any):
+ */
+ list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
+ counter_sched_out(counter, cpuctx, ctx);
+
+ if (group_counter->hw_event.exclusive)
+ cpuctx->exclusive = 0;
+}
+
/*
* Cross CPU call to remove a performance counter
*
curr_rq_lock_irq_save(&flags);
spin_lock(&ctx->lock);
- if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
- counter->state = PERF_COUNTER_STATE_INACTIVE;
- counter->hw_ops->disable(counter);
- ctx->nr_active--;
- cpuctx->active_oncpu--;
- counter->task = NULL;
- counter->oncpu = -1;
- }
+ counter_sched_out(counter, cpuctx, ctx);
+
+ counter->task = NULL;
ctx->nr_counters--;
/*
/*
* Remove the counter from a task's (or a CPU's) list of counters.
*
- * Must be called with counter->mutex held.
+ * Must be called with counter->mutex and ctx->mutex held.
*
* CPU counters are removed with a smp call. For task counters we only
* call when the task is on a CPU.
spin_unlock_irq(&ctx->lock);
}
+/*
+ * Cross CPU call to disable a performance counter
+ */
+static void __perf_counter_disable(void *info)
+{
+ struct perf_counter *counter = info;
+ struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+ struct perf_counter_context *ctx = counter->ctx;
+ unsigned long flags;
+
+ /*
+ * If this is a per-task counter, we need to check whether this
+ * counter's task is the current task on this cpu.
+ */
+ if (ctx->task && cpuctx->task_ctx != ctx)
+ return;
+
+ curr_rq_lock_irq_save(&flags);
+ spin_lock(&ctx->lock);
+
+ /*
+ * If the counter is on, turn it off.
+ * If it is in error state, leave it in error state.
+ */
+ if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
+ if (counter == counter->group_leader)
+ group_sched_out(counter, cpuctx, ctx);
+ else
+ counter_sched_out(counter, cpuctx, ctx);
+ counter->state = PERF_COUNTER_STATE_OFF;
+ }
+
+ spin_unlock(&ctx->lock);
+ curr_rq_unlock_irq_restore(&flags);
+}
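The ordered comparisons on counter->state used here and throughout the patch (state >= PERF_COUNTER_STATE_INACTIVE above, state <= PERF_COUNTER_STATE_OFF in counter_sched_in() below) rely on the states being numerically ordered, with the new ERROR state sorting below OFF. For reference, the ordering this code assumes; the exact values are quoted as an assumption about include/linux/perf_counter.h in this series, not part of this hunk:

    enum perf_counter_active_state {
            PERF_COUNTER_STATE_ERROR        = -2,   /* pinned group could not be scheduled */
            PERF_COUNTER_STATE_OFF          = -1,   /* explicitly disabled */
            PERF_COUNTER_STATE_INACTIVE     =  0,   /* enabled, not currently on the PMU */
            PERF_COUNTER_STATE_ACTIVE       =  1,   /* currently counting on a CPU */
    };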
+
+/*
+ * Disable a counter.
+ */
+static void perf_counter_disable(struct perf_counter *counter)
+{
+ struct perf_counter_context *ctx = counter->ctx;
+ struct task_struct *task = ctx->task;
+
+ if (!task) {
+ /*
+ * Disable the counter on the cpu that it's on
+ */
+ smp_call_function_single(counter->cpu, __perf_counter_disable,
+ counter, 1);
+ return;
+ }
+
+ retry:
+ task_oncpu_function_call(task, __perf_counter_disable, counter);
+
+ spin_lock_irq(&ctx->lock);
+ /*
+ * If the counter is still active, we need to retry the cross-call.
+ */
+ if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
+ spin_unlock_irq(&ctx->lock);
+ goto retry;
+ }
+
+ /*
+ * Since we have the lock this context can't be scheduled
+ * in, so we can change the state safely.
+ */
+ if (counter->state == PERF_COUNTER_STATE_INACTIVE)
+ counter->state = PERF_COUNTER_STATE_OFF;
+
+ spin_unlock_irq(&ctx->lock);
+}
+
+/*
+ * Disable a counter and all its children.
+ */
+static void perf_counter_disable_family(struct perf_counter *counter)
+{
+ struct perf_counter *child;
+
+ perf_counter_disable(counter);
+
+ /*
+ * Lock the mutex to protect the list of children
+ */
+ mutex_lock(&counter->mutex);
+ list_for_each_entry(child, &counter->child_list, child_list)
+ perf_counter_disable(child);
+ mutex_unlock(&counter->mutex);
+}
+
static int
counter_sched_in(struct perf_counter *counter,
struct perf_cpu_context *cpuctx,
struct perf_counter_context *ctx,
int cpu)
{
- if (counter->state == PERF_COUNTER_STATE_OFF)
+ if (counter->state <= PERF_COUNTER_STATE_OFF)
return 0;
counter->state = PERF_COUNTER_STATE_ACTIVE;
return -EAGAIN;
}
- cpuctx->active_oncpu++;
+ if (!is_software_counter(counter))
+ cpuctx->active_oncpu++;
ctx->nr_active++;
+ if (counter->hw_event.exclusive)
+ cpuctx->exclusive = 1;
+
return 0;
}
+/*
+ * Return 1 for a group consisting entirely of software counters,
+ * 0 if the group contains any hardware counters.
+ */
+static int is_software_only_group(struct perf_counter *leader)
+{
+ struct perf_counter *counter;
+
+ if (!is_software_counter(leader))
+ return 0;
+ list_for_each_entry(counter, &leader->sibling_list, list_entry)
+ if (!is_software_counter(counter))
+ return 0;
+ return 1;
+}
+
+/*
+ * Work out whether we can put this counter group on the CPU now.
+ */
+static int group_can_go_on(struct perf_counter *counter,
+ struct perf_cpu_context *cpuctx,
+ int can_add_hw)
+{
+ /*
+ * Groups consisting entirely of software counters can always go on.
+ */
+ if (is_software_only_group(counter))
+ return 1;
+ /*
+ * If an exclusive group is already on, no other hardware
+ * counters can go on.
+ */
+ if (cpuctx->exclusive)
+ return 0;
+ /*
+ * If this group is exclusive and there are already
+ * counters on the CPU, it can't go on.
+ */
+ if (counter->hw_event.exclusive && cpuctx->active_oncpu)
+ return 0;
+ /*
+ * Otherwise, try to add it if all previous groups were able
+ * to go on.
+ */
+ return can_add_hw;
+}
+
/*
* Cross CPU call to install and enable a performance counter
*/
struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
struct perf_counter *counter = info;
struct perf_counter_context *ctx = counter->ctx;
+ struct perf_counter *leader = counter->group_leader;
int cpu = smp_processor_id();
unsigned long flags;
u64 perf_flags;
+ int err;
/*
* If this is a task context, we need to check whether it is
list_add_counter(counter, ctx);
ctx->nr_counters++;
+ counter->prev_state = PERF_COUNTER_STATE_OFF;
+
+ /*
+ * Don't put the counter on if it is disabled or if
+ * it is in a group and the group isn't on.
+ */
+ if (counter->state != PERF_COUNTER_STATE_INACTIVE ||
+ (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE))
+ goto unlock;
- counter_sched_in(counter, cpuctx, ctx, cpu);
+ /*
+ * An exclusive counter can't go on if there are already active
+ * hardware counters, and no hardware counter can go on if there
+ * is already an exclusive counter on.
+ */
+ if (!group_can_go_on(counter, cpuctx, 1))
+ err = -EEXIST;
+ else
+ err = counter_sched_in(counter, cpuctx, ctx, cpu);
+
+ if (err) {
+ /*
+ * This counter couldn't go on. If it is in a group
+ * then we have to pull the whole group off.
+ * If the counter group is pinned then put it in error state.
+ */
+ if (leader != counter)
+ group_sched_out(leader, cpuctx, ctx);
+ if (leader->hw_event.pinned)
+ leader->state = PERF_COUNTER_STATE_ERROR;
+ }
- if (!ctx->task && cpuctx->max_pertask)
+ if (!err && !ctx->task && cpuctx->max_pertask)
cpuctx->max_pertask--;
+ unlock:
hw_perf_restore(perf_flags);
spin_unlock(&ctx->lock);
* If the counter is attached to a task which is on a CPU we use a smp
* call to enable it in the task context. The task might have been
* scheduled away, but we check this in the smp call again.
+ *
+ * Must be called with ctx->mutex held.
*/
static void
perf_install_in_context(struct perf_counter_context *ctx,
{
struct task_struct *task = ctx->task;
- counter->ctx = ctx;
if (!task) {
/*
* Per cpu counters are installed via an smp call and
/*
* we need to retry the smp call.
*/
- if (ctx->nr_active && list_empty(&counter->list_entry)) {
+ if (ctx->is_active && list_empty(&counter->list_entry)) {
spin_unlock_irq(&ctx->lock);
goto retry;
}
spin_unlock_irq(&ctx->lock);
}
-static void
-counter_sched_out(struct perf_counter *counter,
- struct perf_cpu_context *cpuctx,
- struct perf_counter_context *ctx)
+/*
+ * Cross CPU call to enable a performance counter
+ */
+static void __perf_counter_enable(void *info)
{
- if (counter->state != PERF_COUNTER_STATE_ACTIVE)
+ struct perf_counter *counter = info;
+ struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+ struct perf_counter_context *ctx = counter->ctx;
+ struct perf_counter *leader = counter->group_leader;
+ unsigned long flags;
+ int err;
+
+ /*
+ * If this is a per-task counter, we need to check whether this
+ * counter's task is the current task on this cpu.
+ */
+ if (ctx->task && cpuctx->task_ctx != ctx)
return;
+ curr_rq_lock_irq_save(&flags);
+ spin_lock(&ctx->lock);
+
+ counter->prev_state = counter->state;
+ if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
+ goto unlock;
counter->state = PERF_COUNTER_STATE_INACTIVE;
- counter->hw_ops->disable(counter);
- counter->oncpu = -1;
- cpuctx->active_oncpu--;
- ctx->nr_active--;
+ /*
+ * If the counter is in a group and isn't the group leader,
+ * then don't put it on unless the group is on.
+ */
+ if (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE)
+ goto unlock;
+
+ if (!group_can_go_on(counter, cpuctx, 1))
+ err = -EEXIST;
+ else
+ err = counter_sched_in(counter, cpuctx, ctx,
+ smp_processor_id());
+
+ if (err) {
+ /*
+ * If this counter can't go on and it's part of a
+ * group, then the whole group has to come off.
+ */
+ if (leader != counter)
+ group_sched_out(leader, cpuctx, ctx);
+ if (leader->hw_event.pinned)
+ leader->state = PERF_COUNTER_STATE_ERROR;
+ }
+
+ unlock:
+ spin_unlock(&ctx->lock);
+ curr_rq_unlock_irq_restore(&flags);
}
-static void
-group_sched_out(struct perf_counter *group_counter,
- struct perf_cpu_context *cpuctx,
- struct perf_counter_context *ctx)
+/*
+ * Enable a counter.
+ */
+static void perf_counter_enable(struct perf_counter *counter)
{
- struct perf_counter *counter;
+ struct perf_counter_context *ctx = counter->ctx;
+ struct task_struct *task = ctx->task;
- counter_sched_out(group_counter, cpuctx, ctx);
+ if (!task) {
+ /*
+ * Enable the counter on the cpu that it's on
+ */
+ smp_call_function_single(counter->cpu, __perf_counter_enable,
+ counter, 1);
+ return;
+ }
+
+ spin_lock_irq(&ctx->lock);
+ if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
+ goto out;
/*
- * Schedule out siblings (if any):
+ * If the counter is in error state, clear that first.
+ * That way, if we see the counter in error state below, we
+ * know that it has gone back into error state, as distinct
+ * from the task having been scheduled away before the
+ * cross-call arrived.
*/
- list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
- counter_sched_out(counter, cpuctx, ctx);
+ if (counter->state == PERF_COUNTER_STATE_ERROR)
+ counter->state = PERF_COUNTER_STATE_OFF;
+
+ retry:
+ spin_unlock_irq(&ctx->lock);
+ task_oncpu_function_call(task, __perf_counter_enable, counter);
+
+ spin_lock_irq(&ctx->lock);
+
+ /*
+ * If the context is active and the counter is still off,
+ * we need to retry the cross-call.
+ */
+ if (ctx->is_active && counter->state == PERF_COUNTER_STATE_OFF)
+ goto retry;
+
+ /*
+ * Since we have the lock this context can't be scheduled
+ * in, so we can change the state safely.
+ */
+ if (counter->state == PERF_COUNTER_STATE_OFF)
+ counter->state = PERF_COUNTER_STATE_INACTIVE;
+ out:
+ spin_unlock_irq(&ctx->lock);
+}
+
+/*
+ * Enable a counter and all its children.
+ */
+static void perf_counter_enable_family(struct perf_counter *counter)
+{
+ struct perf_counter *child;
+
+ perf_counter_enable(counter);
+
+ /*
+ * Lock the mutex to protect the list of children
+ */
+ mutex_lock(&counter->mutex);
+ list_for_each_entry(child, &counter->child_list, child_list)
+ perf_counter_enable(child);
+ mutex_unlock(&counter->mutex);
}
void __perf_counter_sched_out(struct perf_counter_context *ctx,
struct perf_cpu_context *cpuctx)
{
struct perf_counter *counter;
+ u64 flags;
+ spin_lock(&ctx->lock);
+ ctx->is_active = 0;
if (likely(!ctx->nr_counters))
- return;
+ goto out;
- spin_lock(&ctx->lock);
+ flags = hw_perf_save_disable();
if (ctx->nr_active) {
list_for_each_entry(counter, &ctx->counter_list, list_entry)
group_sched_out(counter, cpuctx, ctx);
}
+ hw_perf_restore(flags);
+ out:
spin_unlock(&ctx->lock);
}
int cpu)
{
struct perf_counter *counter, *partial_group;
- int ret = 0;
+ int ret;
+
+ if (group_counter->state == PERF_COUNTER_STATE_OFF)
+ return 0;
+ ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu);
+ if (ret)
+ return ret < 0 ? ret : 0;
+
+ group_counter->prev_state = group_counter->state;
if (counter_sched_in(group_counter, cpuctx, ctx, cpu))
return -EAGAIN;
* Schedule in siblings as one group (if any):
*/
list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
+ counter->prev_state = counter->state;
if (counter_sched_in(counter, cpuctx, ctx, cpu)) {
partial_group = counter;
goto group_error;
}
- ret = -EAGAIN;
}
- return ret;
+ return 0;
group_error:
/*
struct perf_cpu_context *cpuctx, int cpu)
{
struct perf_counter *counter;
+ u64 flags;
+ int can_add_hw = 1;
+ spin_lock(&ctx->lock);
+ ctx->is_active = 1;
if (likely(!ctx->nr_counters))
- return;
+ goto out;
- spin_lock(&ctx->lock);
+ flags = hw_perf_save_disable();
+
+ /*
+ * First go through the list and put on any pinned groups
+ * in order to give them the best chance of going on.
+ */
list_for_each_entry(counter, &ctx->counter_list, list_entry) {
+ if (counter->state <= PERF_COUNTER_STATE_OFF ||
+ !counter->hw_event.pinned)
+ continue;
+ if (counter->cpu != -1 && counter->cpu != cpu)
+ continue;
+
+ if (group_can_go_on(counter, cpuctx, 1))
+ group_sched_in(counter, cpuctx, ctx, cpu);
+
+ /*
+ * If this pinned group hasn't been scheduled,
+ * put it in error state.
+ */
+ if (counter->state == PERF_COUNTER_STATE_INACTIVE)
+ counter->state = PERF_COUNTER_STATE_ERROR;
+ }
+
+ list_for_each_entry(counter, &ctx->counter_list, list_entry) {
+ /*
+ * Ignore counters in OFF or ERROR state, and
+ * ignore pinned counters since we did them already.
+ */
+ if (counter->state <= PERF_COUNTER_STATE_OFF ||
+ counter->hw_event.pinned)
+ continue;
+
/*
* Listen to the 'cpu' scheduling filter constraint
* of counters:
if (counter->cpu != -1 && counter->cpu != cpu)
continue;
- /*
- * If we scheduled in a group atomically and
- * exclusively, break out:
- */
- if (group_sched_in(counter, cpuctx, ctx, cpu))
- break;
+ if (group_can_go_on(counter, cpuctx, can_add_hw)) {
+ if (group_sched_in(counter, cpuctx, ctx, cpu))
+ can_add_hw = 0;
+ }
}
+ hw_perf_restore(flags);
+ out:
spin_unlock(&ctx->lock);
}
*/
perf_flags = hw_perf_save_disable();
- list_for_each_entry(counter, &ctx->counter_list, list_entry)
- counter->state = PERF_COUNTER_STATE_OFF;
+ list_for_each_entry(counter, &ctx->counter_list, list_entry) {
+ if (counter->state != PERF_COUNTER_STATE_ERROR)
+ counter->state = PERF_COUNTER_STATE_OFF;
+ }
hw_perf_restore(perf_flags);
perf_flags = hw_perf_save_disable();
list_for_each_entry(counter, &ctx->counter_list, list_entry) {
- if (counter->state != PERF_COUNTER_STATE_OFF)
+ if (counter->state > PERF_COUNTER_STATE_OFF)
continue;
counter->state = PERF_COUNTER_STATE_INACTIVE;
counter->hw_event.disabled = 0;
file->private_data = NULL;
+ mutex_lock(&ctx->mutex);
mutex_lock(&counter->mutex);
perf_counter_remove_from_context(counter);
- put_context(ctx);
mutex_unlock(&counter->mutex);
+ mutex_unlock(&ctx->mutex);
kfree(counter);
+ put_context(ctx);
return 0;
}
if (count != sizeof(cntval))
return -EINVAL;
+ /*
+ * Return end-of-file for a read on a counter that is in
+ * error state (i.e. because it was pinned but it couldn't be
+ * scheduled on to the CPU at some point).
+ */
+ if (counter->state == PERF_COUNTER_STATE_ERROR)
+ return 0;
+
mutex_lock(&counter->mutex);
cntval = perf_counter_read(counter);
mutex_unlock(&counter->mutex);
{
struct perf_data *irqdata, *usrdata;
DECLARE_WAITQUEUE(wait, current);
- ssize_t res;
+ ssize_t res, res2;
irqdata = counter->irqdata;
usrdata = counter->usrdata;
if (signal_pending(current))
break;
+ if (counter->state == PERF_COUNTER_STATE_ERROR)
+ break;
+
spin_unlock_irq(&counter->waitq.lock);
schedule();
spin_lock_irq(&counter->waitq.lock);
__set_current_state(TASK_RUNNING);
spin_unlock_irq(&counter->waitq.lock);
- if (usrdata->len + irqdata->len < count)
+ if (usrdata->len + irqdata->len < count &&
+ counter->state != PERF_COUNTER_STATE_ERROR)
return -ERESTARTSYS;
read_pending:
mutex_lock(&counter->mutex);
/* Switch irq buffer: */
usrdata = perf_switch_irq_data(counter);
- if (perf_copy_usrdata(usrdata, buf + res, count - res) < 0) {
+ res2 = perf_copy_usrdata(usrdata, buf + res, count - res);
+ if (res2 < 0) {
if (!res)
res = -EFAULT;
} else {
- res = count;
+ res += res2;
}
out:
mutex_unlock(&counter->mutex);
return events;
}
+static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ struct perf_counter *counter = file->private_data;
+ int err = 0;
+
+ switch (cmd) {
+ case PERF_COUNTER_IOC_ENABLE:
+ perf_counter_enable_family(counter);
+ break;
+ case PERF_COUNTER_IOC_DISABLE:
+ perf_counter_disable_family(counter);
+ break;
+ default:
+ err = -ENOTTY;
+ }
+ return err;
+}
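PERF_COUNTER_IOC_ENABLE and PERF_COUNTER_IOC_DISABLE map onto perf_counter_{enable,disable}_family(), so toggling a counter from user space also covers any counters inherited by child tasks. A hedged user-space sketch, assuming a counter fd already obtained from the perf_counter_open syscall and the ioctl definitions from <linux/perf_counter.h>:

    #include <sys/ioctl.h>
    #include <linux/perf_counter.h>

    /* Pause and resume counting around a region of interest.
     * counter_fd comes from an earlier perf_counter_open() call;
     * setup_workload()/run_workload() are hypothetical. */
    static void count_region(int counter_fd)
    {
            ioctl(counter_fd, PERF_COUNTER_IOC_DISABLE, 0);
            setup_workload();
            ioctl(counter_fd, PERF_COUNTER_IOC_ENABLE, 0);
            run_workload();
            ioctl(counter_fd, PERF_COUNTER_IOC_DISABLE, 0);
    }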
+
static const struct file_operations perf_fops = {
.release = perf_release,
.read = perf_read,
.poll = perf_poll,
+ .unlocked_ioctl = perf_ioctl,
+ .compat_ioctl = perf_ioctl,
};
static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
{
+ int cpu = raw_smp_processor_id();
+
+ atomic64_set(&counter->hw.prev_count, cpu_clock(cpu));
return 0;
}
+static void cpu_clock_perf_counter_update(struct perf_counter *counter)
+{
+ int cpu = raw_smp_processor_id();
+ s64 prev;
+ u64 now;
+
+ now = cpu_clock(cpu);
+ prev = atomic64_read(&counter->hw.prev_count);
+ atomic64_set(&counter->hw.prev_count, now);
+ atomic64_add(now - prev, &counter->count);
+}
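All the software counters in this file now follow the same idiom: hw.prev_count holds the last raw reading, enable() sets the baseline, and update() folds the delta (now - prev) into counter->count, so the counter only reports what accumulated while it was scheduled in. A minimal user-space illustration of the same accumulate-by-delta pattern, using CLOCK_MONOTONIC as the raw source:

    #include <stdint.h>
    #include <time.h>

    static uint64_t prev_count;     /* analogous to counter->hw.prev_count */
    static uint64_t count;          /* analogous to counter->count */

    static uint64_t read_raw_ns(void)
    {
            struct timespec ts;

            clock_gettime(CLOCK_MONOTONIC, &ts);
            return (uint64_t)ts.tv_sec * 1000000000ull + ts.tv_nsec;
    }

    static void enable(void)        /* cf. cpu_clock_perf_counter_enable() */
    {
            prev_count = read_raw_ns();
    }

    static void update(void)        /* cf. cpu_clock_perf_counter_update() */
    {
            uint64_t now = read_raw_ns();

            count += now - prev_count;
            prev_count = now;
    }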
+
static void cpu_clock_perf_counter_disable(struct perf_counter *counter)
{
+ cpu_clock_perf_counter_update(counter);
}
static void cpu_clock_perf_counter_read(struct perf_counter *counter)
{
- int cpu = raw_smp_processor_id();
-
- atomic64_set(&counter->count, cpu_clock(cpu));
+ cpu_clock_perf_counter_update(counter);
}
static const struct hw_perf_counter_ops perf_ops_cpu_clock = {
static int task_clock_perf_counter_enable(struct perf_counter *counter)
{
- u64 now = task_clock_perf_counter_val(counter, 0);
-
- atomic64_set(&counter->hw.prev_count, now);
+ if (counter->prev_state <= PERF_COUNTER_STATE_OFF)
+ atomic64_set(&counter->hw.prev_count,
+ task_clock_perf_counter_val(counter, 0));
return 0;
}
.read = task_clock_perf_counter_read,
};
-static u64 get_page_faults(void)
+#ifdef CONFIG_VM_EVENT_COUNTERS
+#define cpu_page_faults() __get_cpu_var(vm_event_states).event[PGFAULT]
+#else
+#define cpu_page_faults() 0
+#endif
+
+static u64 get_page_faults(struct perf_counter *counter)
{
- struct task_struct *curr = current;
+ struct task_struct *curr = counter->ctx->task;
- return curr->maj_flt + curr->min_flt;
+ if (curr)
+ return curr->maj_flt + curr->min_flt;
+ return cpu_page_faults();
}
static void page_faults_perf_counter_update(struct perf_counter *counter)
s64 delta;
prev = atomic64_read(&counter->hw.prev_count);
- now = get_page_faults();
+ now = get_page_faults(counter);
atomic64_set(&counter->hw.prev_count, now);
static int page_faults_perf_counter_enable(struct perf_counter *counter)
{
- /*
- * page-faults is a per-task value already,
- * so we dont have to clear it on switch-in.
- */
-
+ if (counter->prev_state <= PERF_COUNTER_STATE_OFF)
+ atomic64_set(&counter->hw.prev_count, get_page_faults(counter));
return 0;
}
.read = page_faults_perf_counter_read,
};
-static u64 get_context_switches(void)
+static u64 get_context_switches(struct perf_counter *counter)
{
- struct task_struct *curr = current;
+ struct task_struct *curr = counter->ctx->task;
- return curr->nvcsw + curr->nivcsw;
+ if (curr)
+ return curr->nvcsw + curr->nivcsw;
+ return cpu_nr_switches(smp_processor_id());
}
static void context_switches_perf_counter_update(struct perf_counter *counter)
s64 delta;
prev = atomic64_read(&counter->hw.prev_count);
- now = get_context_switches();
+ now = get_context_switches(counter);
atomic64_set(&counter->hw.prev_count, now);
static int context_switches_perf_counter_enable(struct perf_counter *counter)
{
- /*
- * ->nvcsw + curr->nivcsw is a per-task value already,
- * so we dont have to clear it on switch-in.
- */
-
+ if (counter->prev_state <= PERF_COUNTER_STATE_OFF)
+ atomic64_set(&counter->hw.prev_count,
+ get_context_switches(counter));
return 0;
}
.read = context_switches_perf_counter_read,
};
-static inline u64 get_cpu_migrations(void)
+static inline u64 get_cpu_migrations(struct perf_counter *counter)
{
- return current->se.nr_migrations;
+ struct task_struct *curr = counter->ctx->task;
+
+ if (curr)
+ return curr->se.nr_migrations;
+ return cpu_nr_migrations(smp_processor_id());
}
static void cpu_migrations_perf_counter_update(struct perf_counter *counter)
s64 delta;
prev = atomic64_read(&counter->hw.prev_count);
- now = get_cpu_migrations();
+ now = get_cpu_migrations(counter);
atomic64_set(&counter->hw.prev_count, now);
static int cpu_migrations_perf_counter_enable(struct perf_counter *counter)
{
- /*
- * se.nr_migrations is a per-task value already,
- * so we dont have to clear it on switch-in.
- */
-
+ if (counter->prev_state <= PERF_COUNTER_STATE_OFF)
+ atomic64_set(&counter->hw.prev_count,
+ get_cpu_migrations(counter));
return 0;
}
{
const struct hw_perf_counter_ops *hw_ops = NULL;
+ /*
+ * Software counters (currently) can't in general distinguish
+ * between user, kernel and hypervisor events.
+ * However, context switches and cpu migrations are considered
+ * to be kernel events, and page faults are never hypervisor
+ * events.
+ */
switch (counter->hw_event.type) {
case PERF_COUNT_CPU_CLOCK:
- hw_ops = &perf_ops_cpu_clock;
+ if (!(counter->hw_event.exclude_user ||
+ counter->hw_event.exclude_kernel ||
+ counter->hw_event.exclude_hv))
+ hw_ops = &perf_ops_cpu_clock;
break;
case PERF_COUNT_TASK_CLOCK:
- hw_ops = &perf_ops_task_clock;
+ if (counter->hw_event.exclude_user ||
+ counter->hw_event.exclude_kernel ||
+ counter->hw_event.exclude_hv)
+ break;
+ /*
+ * If the user instantiates this as a per-cpu counter,
+ * use the cpu_clock counter instead.
+ */
+ if (counter->ctx->task)
+ hw_ops = &perf_ops_task_clock;
+ else
+ hw_ops = &perf_ops_cpu_clock;
break;
case PERF_COUNT_PAGE_FAULTS:
- hw_ops = &perf_ops_page_faults;
+ if (!(counter->hw_event.exclude_user ||
+ counter->hw_event.exclude_kernel))
+ hw_ops = &perf_ops_page_faults;
break;
case PERF_COUNT_CONTEXT_SWITCHES:
- hw_ops = &perf_ops_context_switches;
+ if (!counter->hw_event.exclude_kernel)
+ hw_ops = &perf_ops_context_switches;
break;
case PERF_COUNT_CPU_MIGRATIONS:
- hw_ops = &perf_ops_cpu_migrations;
+ if (!counter->hw_event.exclude_kernel)
+ hw_ops = &perf_ops_cpu_migrations;
break;
default:
break;
static struct perf_counter *
perf_counter_alloc(struct perf_counter_hw_event *hw_event,
int cpu,
+ struct perf_counter_context *ctx,
struct perf_counter *group_leader,
gfp_t gfpflags)
{
INIT_LIST_HEAD(&counter->sibling_list);
init_waitqueue_head(&counter->waitq);
+ INIT_LIST_HEAD(&counter->child_list);
+
counter->irqdata = &counter->data[0];
counter->usrdata = &counter->data[1];
counter->cpu = cpu;
counter->wakeup_pending = 0;
counter->group_leader = group_leader;
counter->hw_ops = NULL;
+ counter->ctx = ctx;
counter->state = PERF_COUNTER_STATE_INACTIVE;
if (hw_event->disabled)
hw_ops = NULL;
if (!hw_event->raw && hw_event->type < 0)
hw_ops = sw_perf_counter_init(counter);
- if (!hw_ops)
+ else
hw_ops = hw_perf_counter_init(counter);
if (!hw_ops) {
* @cpu: target cpu
* @group_fd: group leader counter fd
*/
-asmlinkage int
-sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user,
- pid_t pid, int cpu, int group_fd)
+SYSCALL_DEFINE4(perf_counter_open,
+ const struct perf_counter_hw_event __user *, hw_event_uptr,
+ pid_t, pid, int, cpu, int, group_fd)
{
struct perf_counter *counter, *group_leader;
struct perf_counter_hw_event hw_event;
*/
if (group_leader->ctx != ctx)
goto err_put_context;
+ /*
+ * Only a group leader can be exclusive or pinned
+ */
+ if (hw_event.exclusive || hw_event.pinned)
+ goto err_put_context;
}
ret = -EINVAL;
- counter = perf_counter_alloc(&hw_event, cpu, group_leader, GFP_KERNEL);
+ counter = perf_counter_alloc(&hw_event, cpu, ctx, group_leader,
+ GFP_KERNEL);
if (!counter)
goto err_put_context;
goto err_free_put_context;
counter->filp = counter_file;
+ mutex_lock(&ctx->mutex);
perf_install_in_context(ctx, counter, cpu);
+ mutex_unlock(&ctx->mutex);
fput_light(counter_file, fput_needed2);
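From user space, the new attributes combine as follows: disabled creates the counter stopped so it can be started later with the enable ioctl, pinned asks for the counter to be on the PMU whenever its task runs (with read() returning 0/EOF if that ever becomes impossible, see perf_read() above), and the exclude_* bits restrict which privilege levels are counted. A hedged sketch; the raw syscall wrapper and __NR_perf_counter_open constant are placeholders for whatever glue the user-space tools of this series actually use:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <sys/syscall.h>
    #include <linux/perf_counter.h>

    /* Hypothetical wrapper; the syscall number is arch-specific. */
    static int perf_counter_open(struct perf_counter_hw_event *ev,
                                 pid_t pid, int cpu, int group_fd)
    {
            return syscall(__NR_perf_counter_open, ev, pid, cpu, group_fd);
    }

    int main(void)
    {
            struct perf_counter_hw_event ev;
            uint64_t value;
            int fd;

            memset(&ev, 0, sizeof(ev));
            ev.type = PERF_COUNT_CONTEXT_SWITCHES;
            ev.disabled = 1;                /* start stopped, enable via ioctl */
            ev.exclude_user = 1;            /* permitted for this software counter */

            fd = perf_counter_open(&ev, 0 /* current task */, -1 /* any cpu */, -1);
            if (fd < 0)
                    return 1;

            ioctl(fd, PERF_COUNTER_IOC_ENABLE, 0);
            /* ... workload ... */
            ioctl(fd, PERF_COUNTER_IOC_DISABLE, 0);

            if (read(fd, &value, sizeof(value)) == sizeof(value))
                    printf("context switches: %llu\n",
                           (unsigned long long)value);
            close(fd);
            return 0;
    }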
{
memset(ctx, 0, sizeof(*ctx));
spin_lock_init(&ctx->lock);
+ mutex_init(&ctx->mutex);
INIT_LIST_HEAD(&ctx->counter_list);
ctx->task = task;
}
/*
* inherit a counter from parent task to child task:
*/
-static int
+static struct perf_counter *
inherit_counter(struct perf_counter *parent_counter,
struct task_struct *parent,
struct perf_counter_context *parent_ctx,
struct task_struct *child,
+ struct perf_counter *group_leader,
struct perf_counter_context *child_ctx)
{
struct perf_counter *child_counter;
+ /*
+ * Instead of creating recursive hierarchies of counters,
+ * we link inherited counters back to the original parent,
+ * which is guaranteed to have a filp; we use that filp as the
+ * reference count:
+ */
+ if (parent_counter->parent)
+ parent_counter = parent_counter->parent;
+
child_counter = perf_counter_alloc(&parent_counter->hw_event,
- parent_counter->cpu, NULL,
- GFP_ATOMIC);
+ parent_counter->cpu, child_ctx,
+ group_leader, GFP_KERNEL);
if (!child_counter)
- return -ENOMEM;
+ return NULL;
/*
* Link it up in the child's context:
*/
- child_counter->ctx = child_ctx;
child_counter->task = child;
list_add_counter(child_counter, child_ctx);
child_ctx->nr_counters++;
*/
atomic_long_inc(&parent_counter->filp->f_count);
+ /*
+ * Link this into the parent counter's child list
+ */
+ mutex_lock(&parent_counter->mutex);
+ list_add_tail(&child_counter->child_list, &parent_counter->child_list);
+
+ /*
+ * Make the child state follow the state of the parent counter,
+ * not its hw_event.disabled bit. We hold the parent's mutex,
+ * so we won't race with perf_counter_{en,dis}able_family.
+ */
+ if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE)
+ child_counter->state = PERF_COUNTER_STATE_INACTIVE;
+ else
+ child_counter->state = PERF_COUNTER_STATE_OFF;
+
+ mutex_unlock(&parent_counter->mutex);
+
+ return child_counter;
+}
+
+static int inherit_group(struct perf_counter *parent_counter,
+ struct task_struct *parent,
+ struct perf_counter_context *parent_ctx,
+ struct task_struct *child,
+ struct perf_counter_context *child_ctx)
+{
+ struct perf_counter *leader;
+ struct perf_counter *sub;
+
+ leader = inherit_counter(parent_counter, parent, parent_ctx,
+ child, NULL, child_ctx);
+ if (!leader)
+ return -ENOMEM;
+ list_for_each_entry(sub, &parent_counter->sibling_list, list_entry) {
+ if (!inherit_counter(sub, parent, parent_ctx,
+ child, leader, child_ctx))
+ return -ENOMEM;
+ }
return 0;
}
+static void sync_child_counter(struct perf_counter *child_counter,
+ struct perf_counter *parent_counter)
+{
+ u64 parent_val, child_val;
+
+ parent_val = atomic64_read(&parent_counter->count);
+ child_val = atomic64_read(&child_counter->count);
+
+ /*
+ * Add back the child's count to the parent's count:
+ */
+ atomic64_add(child_val, &parent_counter->count);
+
+ /*
+ * Remove this counter from the parent's list
+ */
+ mutex_lock(&parent_counter->mutex);
+ list_del_init(&child_counter->child_list);
+ mutex_unlock(&parent_counter->mutex);
+
+ /*
+ * Release the parent counter, if this was the last
+ * reference to it.
+ */
+ fput(parent_counter->filp);
+}
+
static void
__perf_counter_exit_task(struct task_struct *child,
struct perf_counter *child_counter,
struct perf_counter_context *child_ctx)
{
struct perf_counter *parent_counter;
- u64 parent_val, child_val;
+ struct perf_counter *sub, *tmp;
/*
* If we do not self-reap then we have to wait for the
cpuctx = &__get_cpu_var(perf_cpu_context);
- if (child_counter->state == PERF_COUNTER_STATE_ACTIVE) {
- child_counter->state = PERF_COUNTER_STATE_INACTIVE;
- child_counter->hw_ops->disable(child_counter);
- cpuctx->active_oncpu--;
- child_ctx->nr_active--;
- child_counter->oncpu = -1;
- }
+ group_sched_out(child_counter, cpuctx, child_ctx);
list_del_init(&child_counter->list_entry);
* that are still around due to the child reference. These
* counters need to be zapped - but otherwise linger.
*/
- if (!parent_counter)
- return;
-
- parent_val = atomic64_read(&parent_counter->count);
- child_val = atomic64_read(&child_counter->count);
-
- /*
- * Add back the child's count to the parent's count:
- */
- atomic64_add(child_val, &parent_counter->count);
-
- fput(parent_counter->filp);
-
- kfree(child_counter);
+ if (parent_counter) {
+ sync_child_counter(child_counter, parent_counter);
+ list_for_each_entry_safe(sub, tmp, &child_counter->sibling_list,
+ list_entry) {
+ if (sub->parent) {
+ sync_child_counter(sub, sub->parent);
+ kfree(sub);
+ }
+ }
+ kfree(child_counter);
+ }
}
/*
- * When a child task exist, feed back counter values to parent counters.
+ * When a child task exits, feed back counter values to parent counters.
*
- * Note: we are running in child context, but the PID is not hashed
+ * Note: we may be running in child context, but the PID is not hashed
* anymore so new counters will not be added.
*/
void perf_counter_exit_task(struct task_struct *child)
void perf_counter_init_task(struct task_struct *child)
{
struct perf_counter_context *child_ctx, *parent_ctx;
- struct perf_counter *counter, *parent_counter;
+ struct perf_counter *counter;
struct task_struct *parent = current;
- unsigned long flags;
child_ctx = &child->perf_counter_ctx;
parent_ctx = &parent->perf_counter_ctx;
* Lock the parent list. No need to lock the child - not PID
* hashed yet and not running, so nobody can access it.
*/
- spin_lock_irqsave(&parent_ctx->lock, flags);
+ mutex_lock(&parent_ctx->mutex);
/*
* We don't have to disable NMIs - we are only looking at
* the list, not manipulating it:
*/
list_for_each_entry(counter, &parent_ctx->counter_list, list_entry) {
- if (!counter->hw_event.inherit || counter->group_leader != counter)
+ if (!counter->hw_event.inherit)
continue;
- /*
- * Instead of creating recursive hierarchies of counters,
- * we link inheritd counters back to the original parent,
- * which has a filp for sure, which we use as the reference
- * count:
- */
- parent_counter = counter;
- if (counter->parent)
- parent_counter = counter->parent;
-
- if (inherit_counter(parent_counter, parent,
+ if (inherit_group(counter, parent,
parent_ctx, child, child_ctx))
break;
}
- spin_unlock_irqrestore(&parent_ctx->lock, flags);
+ mutex_unlock(&parent_ctx->mutex);
}
static void __cpuinit perf_counter_init_cpu(int cpu)
cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu;
mutex_unlock(&perf_resource_mutex);
- hw_perf_counter_setup();
+ hw_perf_counter_setup(cpu);
}
#ifdef CONFIG_HOTPLUG_CPU
list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry)
__perf_counter_remove_from_context(counter);
-
}
static void perf_counter_exit_cpu(int cpu)
{
+ struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+ struct perf_counter_context *ctx = &cpuctx->ctx;
+
+ mutex_lock(&ctx->mutex);
smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1);
+ mutex_unlock(&ctx->mutex);
}
#else
static inline void perf_counter_exit_cpu(int cpu) { }