From: Borislav Petkov on
From: Borislav Petkov <bp(a)amd64.org>
Date: Sat, May 22, 2010 at 09:04:47PM +0200

> Register and enable events marked as persistent right after perf events
> has initialized.
>
> Not-yet-signed-off-by: Borislav Petkov <bp(a)alien8.de>
> ---
> include/linux/ftrace_event.h | 10 +++++++
> include/linux/perf_event.h | 1 +
> kernel/perf_event.c | 59 +++++++++++++++++++++++++++++++++++++----
> kernel/trace/trace.h | 1 -
> 4 files changed, 64 insertions(+), 7 deletions(-)
>
> diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
> index c0f4b36..b40d637 100644
> --- a/include/linux/ftrace_event.h
> +++ b/include/linux/ftrace_event.h
> @@ -13,6 +13,8 @@ struct dentry;
>
> DECLARE_PER_CPU(struct trace_seq, ftrace_event_seq);
>
> +extern struct list_head ftrace_events;
> +
> struct trace_print_flags {
> unsigned long mask;
> const char *name;
> @@ -134,6 +136,7 @@ struct ftrace_event_call {
> int perf_refcount;
> int (*perf_event_enable)(struct ftrace_event_call *);
> void (*perf_event_disable)(struct ftrace_event_call *);
> + unsigned int type;
> };
>
> #define PERF_MAX_TRACE_SIZE 2048
> @@ -155,6 +158,13 @@ enum {
> FILTER_PTR_STRING,
> };
>
> +enum event_type_t {
> + EVENT_FLEXIBLE = 0x1,
> + EVENT_PINNED = 0x2,
> + EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
> + EVENT_PERSISTENT = 0x3,
> +};

Doh,

I meant

enum event_type_t {
EVENT_FLEXIBLE = 0x1,
EVENT_PINNED = 0x2,
EVENT_PERSISTENT = 0x4,
EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED | EVENT_PERSISTENT,
};

here.

> +
> extern int trace_event_raw_init(struct ftrace_event_call *call);
> extern int trace_define_field(struct ftrace_event_call *call, const char *type,
> const char *name, int offset, int size,
> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index c8e3754..aa62c97 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -579,6 +579,7 @@ struct perf_event {
> struct list_head group_entry;
> struct list_head event_entry;
> struct list_head sibling_list;
> + struct list_head pevent_entry;
> int nr_siblings;
> int group_flags;
> struct perf_event *group_leader;
> diff --git a/kernel/perf_event.c b/kernel/perf_event.c
> index 3d1552d..84f2f36 100644
> --- a/kernel/perf_event.c
> +++ b/kernel/perf_event.c
> @@ -72,6 +72,11 @@ static atomic64_t perf_event_id;
> static DEFINE_SPINLOCK(perf_resource_lock);
>
> /*
> + * persistent events which are always on
> + */
> +DEFINE_PER_CPU(struct list_head, persistent_events);
> +
> +/*
> * Architecture provided APIs - weak aliases:
> */
> extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event)
> @@ -1017,12 +1022,6 @@ static int perf_event_refresh(struct perf_event *event, int refresh)
> return 0;
> }
>
> -enum event_type_t {
> - EVENT_FLEXIBLE = 0x1,
> - EVENT_PINNED = 0x2,
> - EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
> -};
> -
> static void ctx_sched_out(struct perf_event_context *ctx,
> struct perf_cpu_context *cpuctx,
> enum event_type_t event_type)
> @@ -5385,6 +5384,8 @@ static void __init perf_event_init_all_cpus(void)
> for_each_possible_cpu(cpu) {
> cpuctx = &per_cpu(perf_cpu_context, cpu);
> __perf_event_init_context(&cpuctx->ctx, NULL);
> +
> + INIT_LIST_HEAD(&per_cpu(persistent_events, cpu));
> }
> }
>
> @@ -5405,12 +5406,16 @@ static void __perf_event_exit_cpu(void *info)
> struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
> struct perf_event_context *ctx = &cpuctx->ctx;
> struct perf_event *event, *tmp;
> + struct list_head *pers_events_list = &__get_cpu_var(persistent_events);
>
> list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
> __perf_event_remove_from_context(event);
> list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry)
> __perf_event_remove_from_context(event);
> + list_for_each_entry_safe(event, tmp, pers_events_list, pevent_entry)
> + __perf_event_remove_from_context(event);
> }
> +
> static void perf_event_exit_cpu(int cpu)
> {
> struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
> @@ -5456,6 +5461,46 @@ static struct notifier_block __cpuinitdata perf_cpu_nb = {
> .priority = 20,
> };
>
> +static void __init perf_init_persistent_events(void)
> +{
> +
> + struct ftrace_event_call *call;
> + struct perf_event_attr attr;
> + struct perf_event *event;
> + int cpu;
> +
> + list_for_each_entry(call, &ftrace_events, list) {
> +
> + if (call->type != EVENT_PERSISTENT)
> + continue;
> +
> + attr.type = PERF_TYPE_TRACEPOINT,
> + attr.config = call->id,
> + attr.size = sizeof(attr),
> +
> + get_online_cpus();
> +
> + for_each_online_cpu(cpu) {
> + struct list_head *list;
> +
> + event = perf_event_create_kernel_counter(&attr, cpu, -1, NULL);
> + if (IS_ERR(event)) {
> + printk(KERN_ERR "Error initializing persistent "
> + "event %s on cpu %d\n",
> + call->name, cpu);
> + break;
> + }
> +
> + list = &per_cpu(persistent_events, cpu);
> + list_add(&event->pevent_entry, list);
> +
> + perf_event_enable(event);
> +
> + }
> + put_online_cpus();
> + }
> +}
> +
> void __init perf_event_init(void)
> {
> perf_event_init_all_cpus();
> @@ -5464,6 +5509,8 @@ void __init perf_event_init(void)
> perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
> (void *)(long)smp_processor_id());
> register_cpu_notifier(&perf_cpu_nb);
> +
> + perf_init_persistent_events();
> }
>
> static ssize_t perf_show_reserve_percpu(struct sysdev_class *class,
> diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
> index 2825ef2..95f5611 100644
> --- a/kernel/trace/trace.h
> +++ b/kernel/trace/trace.h
> @@ -786,7 +786,6 @@ filter_check_discard(struct ftrace_event_call *call, void *rec,
> }
>
> extern struct mutex event_mutex;
> -extern struct list_head ftrace_events;
>
> extern const char *__start___trace_bprintk_fmt[];
> extern const char *__stop___trace_bprintk_fmt[];
> --
> 1.7.1
>
>

--
Regards/Gruss,
Boris.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo(a)vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
From: Peter Zijlstra on
On Sat, 2010-05-22 at 21:00 +0200, Borislav Petkov wrote:
> Register and enable events marked as persistent right after perf events
> has initialized.
>
> Not-yet-signed-off-by: Borislav Petkov <bp(a)alien8.de>

Nah, this is totally wrong.

A persistent event would simply be a regular event, but created by the
kernel and not tied to a file-desc's lifetime.



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo(a)vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
From: Borislav Petkov on
From: Peter Zijlstra <peterz(a)infradead.org>
Date: Sun, May 23, 2010 at 08:15:13PM +0200

> On Sat, 2010-05-22 at 21:00 +0200, Borislav Petkov wrote:
> > Register and enable events marked as persistent right after perf events
> > has initialized.
> >
> > Not-yet-signed-off-by: Borislav Petkov <bp(a)alien8.de>
>
> Nah, this is totally wrong.
>
> A persistent event would simply be a regular event, but created by the
> kernel and not tied to a file-desc's lifetime.

So you're saying the trace_mce_record() tracepoint for example should
be created completely internally in the kernel and cease to be a
tracepoint? Will it still be able to be selected by perf -e?

Please elaborate.

--
Regards/Gruss,
Boris.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo(a)vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
From: Peter Zijlstra on
On Sun, 2010-05-23 at 20:33 +0200, Borislav Petkov wrote:
> From: Peter Zijlstra <peterz(a)infradead.org>
> Date: Sun, May 23, 2010 at 08:15:13PM +0200
>
> > On Sat, 2010-05-22 at 21:00 +0200, Borislav Petkov wrote:
> > > Register and enable events marked as persistent right after perf events
> > > has initialized.
> > >
> > > Not-yet-signed-off-by: Borislav Petkov <bp(a)alien8.de>
> >
> > Nah, this is totally wrong.
> >
> > A persistent event would simply be a regular event, but created by the
> > kernel and not tied to a file-desc's lifetime.
>
> So you're saying the trace_mce_record() tracepoint for example should
> be created completely internally in the kernel and cease to be a
> tracepoint? Will it still be able to be selected by perf -e?

No, it should be a regular tracepoint as far as tracepoints are
concerned.

But the only thing persistence should add is an instance of a
perf_event; it should not modify either the perf_event or the
tracepoint code.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo(a)vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
From: Borislav Petkov on
From: Peter Zijlstra <peterz(a)infradead.org>
Date: Sun, May 23, 2010 at 08:40:47PM +0200

> > > A persistent event would simply be a regular event, but created by the
> > > kernel and not tied to a file-desc's lifetime.
> >
> > So you're saying the trace_mce_record() tracepoint for example should
> > be created completely internally in the kernel and cease to be a
> > tracepoint? Will it still be able to be selected by perf -e?
>
> No, it should be a regular tracepoint as far as tracepoints are
> concerned.
>
> But the only thing persistence should add is an instance of a
> perf_event; it should not modify either the perf_event or the
> tracepoint code.

That means subsystems which initialize earlier than perf (mce, for
example) would have to be notified when perf is ready so that they
can register a persistent event. How does that sound?

--
Regards/Gruss,
Boris.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo(a)vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/