From: Frederic Weisbecker
On Fri, May 21, 2010 at 11:02:03AM +0200, Peter Zijlstra wrote:
> Avoid the swevent hash-table by using per-tracepoint hlists.
>
> Also, avoid conditionals on the fast path by ordering with probe unregister
> so that we should never get on the callback path without the data being there.
>
> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
> ---
> include/linux/ftrace_event.h | 16 ++---
> include/linux/perf_event.h | 6 +
> include/trace/ftrace.h | 4 -
> kernel/perf_event.c | 94 +++++++++++++++--------------
> kernel/trace/trace_event_perf.c | 127 +++++++++++++++++++++-------------------
> kernel/trace/trace_kprobe.c | 9 +-
> kernel/trace/trace_syscalls.c | 11 ++-
> 7 files changed, 143 insertions(+), 124 deletions(-)
>
> Index: linux-2.6/include/linux/ftrace_event.h
> ===================================================================
> --- linux-2.6.orig/include/linux/ftrace_event.h
> +++ linux-2.6/include/linux/ftrace_event.h
> @@ -133,7 +133,7 @@ struct ftrace_event_call {
> void *data;
>
> int perf_refcount;
> - void *perf_data;
> + struct hlist_head *perf_events;
> int (*perf_event_enable)(struct ftrace_event_call *);
> void (*perf_event_disable)(struct ftrace_event_call *);
> };
> @@ -192,9 +192,11 @@ struct perf_event;
>
> DECLARE_PER_CPU(struct pt_regs, perf_trace_regs);
>
> -extern int perf_trace_enable(int event_id, void *data);
> -extern void perf_trace_disable(int event_id);
> -extern int ftrace_profile_set_filter(struct perf_event *event, int event_id,
> +extern int perf_trace_init(struct perf_event *event);
> +extern void perf_trace_destroy(struct perf_event *event);
> +extern int perf_trace_enable(struct perf_event *event);
> +extern void perf_trace_disable(struct perf_event *event);
> +extern int ftrace_profile_set_filter(struct perf_event *event, int event_id,
> char *filter_str);
> extern void ftrace_profile_free_filter(struct perf_event *event);
> extern void *perf_trace_buf_prepare(int size, unsigned short type,
> @@ -202,11 +204,9 @@ extern void *perf_trace_buf_prepare(int
>
> static inline void
> perf_trace_buf_submit(void *raw_data, int size, int rctx, u64 addr,
> - u64 count, struct pt_regs *regs, void *event)
> + u64 count, struct pt_regs *regs, void *head)
> {
> - struct trace_entry *entry = raw_data;
> -
> - perf_tp_event(entry->type, addr, count, raw_data, size, regs, event);
> + perf_tp_event(addr, count, raw_data, size, regs, head);
> perf_swevent_put_recursion_context(rctx);
> }
> #endif
> Index: linux-2.6/include/trace/ftrace.h
> ===================================================================
> --- linux-2.6.orig/include/trace/ftrace.h
> +++ linux-2.6/include/trace/ftrace.h
> @@ -768,6 +768,7 @@ perf_trace_templ_##call(struct ftrace_ev
> struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\
> struct ftrace_raw_##call *entry; \
> u64 __addr = 0, __count = 1; \
> + struct hlist_head *head; \
> int __entry_size; \
> int __data_size; \
> int rctx; \
> @@ -790,8 +791,9 @@ perf_trace_templ_##call(struct ftrace_ev
> \
> { assign; } \
> \
> + head = per_cpu_ptr(event_call->perf_events, smp_processor_id());\



Shouldn't this be rcu_dereference_sched()?
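Something like this, I mean (untested sketch, assuming the pointer
keeps being published with rcu_assign_pointer()):

	head = rcu_dereference_sched(event_call->perf_events);	\
	head = per_cpu_ptr(head, smp_processor_id());		\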



> perf_trace_buf_submit(entry, __entry_size, rctx, __addr, \
> - __count, __regs, event_call->perf_data); \
> + __count, __regs, head); \
> }
>
> #undef DEFINE_EVENT
> Index: linux-2.6/kernel/trace/trace_event_perf.c
> ===================================================================
> --- linux-2.6.orig/kernel/trace/trace_event_perf.c
> +++ linux-2.6/kernel/trace/trace_event_perf.c
> @@ -23,14 +23,25 @@ typedef typeof(unsigned long [PERF_MAX_T
> /* Count the events in use (per event id, not per instance) */
> static int total_ref_count;
>
> -static int perf_trace_event_enable(struct ftrace_event_call *event, void *data)
> +static int perf_trace_event_init(struct ftrace_event_call *tp_event,
> + struct perf_event *p_event)
> {
> + struct hlist_head *list;
> int ret = -ENOMEM;
> + int cpu;
>
> - if (event->perf_refcount++ > 0) {
> - event->perf_data = NULL;
> + p_event->tp_event = tp_event;
> + if (tp_event->perf_refcount++ > 0)
> return 0;
> - }
> +
> + list = alloc_percpu(struct hlist_head);
> + if (!list)
> + goto fail;
> +
> + for_each_possible_cpu(cpu)
> + INIT_HLIST_HEAD(per_cpu_ptr(list, cpu));
> +
> + tp_event->perf_events = list;



I suspect this must be rcu_assign_pointer().
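i.e. something like this, so it pairs with the dereference on the
reader side:

	rcu_assign_pointer(tp_event->perf_events, list);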




>
> if (!total_ref_count) {
> char *buf;
> @@ -39,20 +50,20 @@ static int perf_trace_event_enable(struc
> for (i = 0; i < 4; i++) {
> buf = (char *)alloc_percpu(perf_trace_t);
> if (!buf)
> - goto fail_buf;
> + goto fail;
>
> - rcu_assign_pointer(perf_trace_buf[i], buf);
> + perf_trace_buf[i] = buf;
> }
> }
>
> - ret = event->perf_event_enable(event);
> - if (!ret) {
> - event->perf_data = data;
> - total_ref_count++;
> - return 0;
> - }
> + ret = tp_event->perf_event_enable(tp_event);
> + if (ret)
> + goto fail;
>
> -fail_buf:
> + total_ref_count++;
> + return 0;
> +
> +fail:
> if (!total_ref_count) {
> int i;
>
> @@ -61,21 +72,26 @@ fail_buf:
> perf_trace_buf[i] = NULL;
> }
> }
> - event->perf_refcount--;
> +
> + if (!--tp_event->perf_refcount) {
> + free_percpu(tp_event->perf_events);
> + tp_event->perf_events = NULL;
> + }
>
> return ret;
> }
>
> -int perf_trace_enable(int event_id, void *data)
> +int perf_trace_init(struct perf_event *p_event)
> {
> - struct ftrace_event_call *event;
> + struct ftrace_event_call *tp_event;
> + int event_id = p_event->attr.config;
> int ret = -EINVAL;
>
> mutex_lock(&event_mutex);
> - list_for_each_entry(event, &ftrace_events, list) {
> - if (event->id == event_id && event->perf_event_enable &&
> - try_module_get(event->mod)) {
> - ret = perf_trace_event_enable(event, data);
> + list_for_each_entry(tp_event, &ftrace_events, list) {
> + if (tp_event->id == event_id && tp_event->perf_event_enable &&
> + try_module_get(tp_event->mod)) {
> + ret = perf_trace_event_init(tp_event, p_event);
> break;
> }
> }
> @@ -84,53 +100,52 @@ int perf_trace_enable(int event_id, void
> return ret;
> }
>
> -static void perf_trace_event_disable(struct ftrace_event_call *event)
> +int perf_trace_enable(struct perf_event *p_event)
> {
> - if (--event->perf_refcount > 0)
> - return;
> + struct ftrace_event_call *tp_event = p_event->tp_event;
> + struct hlist_head *list;
>
> - event->perf_event_disable(event);
> + list = tp_event->perf_events;
> + if (WARN_ON_ONCE(!list))
> + return -EINVAL;
>
> - if (!--total_ref_count) {
> - char *buf[4];
> - int i;
> -
> - for (i = 0; i < 4; i++) {
> - buf[i] = perf_trace_buf[i];
> - rcu_assign_pointer(perf_trace_buf[i], NULL);
> - }
> + list = per_cpu_ptr(list, smp_processor_id());
> + hlist_add_head_rcu(&p_event->hlist_entry, list);



Ah, and maybe add a small comment here, because using the hlist API
this way may puzzle more people than just me ;)
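Something along these lines perhaps (just a sketch of the wording):

	/*
	 * This is really a plain per-cpu list of events; we only use
	 * the hlist API because struct perf_event already carries an
	 * hlist_node for the swevent hash.
	 */
	hlist_add_head_rcu(&p_event->hlist_entry, list);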



> - /*
> - * Ensure every events in profiling have finished before
> - * releasing the buffers
> - */
> - synchronize_sched();
> + return 0;
> +}
>
> - for (i = 0; i < 4; i++)
> - free_percpu(buf[i]);
> - }
> +void perf_trace_disable(struct perf_event *p_event)
> +{
> + hlist_del_rcu(&p_event->hlist_entry);
> }
>
> -void perf_trace_disable(int event_id)
> +void perf_trace_destroy(struct perf_event *p_event)
> {
> - struct ftrace_event_call *event;
> + struct ftrace_event_call *tp_event = p_event->tp_event;
> + int i;
>
> - mutex_lock(&event_mutex);
> - list_for_each_entry(event, &ftrace_events, list) {
> - if (event->id == event_id) {
> - perf_trace_event_disable(event);
> - module_put(event->mod);
> - break;
> + if (--tp_event->perf_refcount > 0)
> + return;
> +
> + tp_event->perf_event_disable(tp_event);



Don't we need a synchronize_sched() here?



> + free_percpu(tp_event->perf_events);
> + tp_event->perf_events = NULL;



And rcu_assign_pointer()?
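Putting the two together, I'd expect the teardown to look something
like this (sketch, assuming we keep the RCU scheme):

	struct hlist_head *list = tp_event->perf_events;

	tp_event->perf_event_disable(tp_event);
	rcu_assign_pointer(tp_event->perf_events, NULL);
	/* wait for in-flight probe callbacks before freeing */
	synchronize_sched();
	free_percpu(list);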



> +
> + if (!--total_ref_count) {
> + for (i = 0; i < 4; i++) {
> + free_percpu(perf_trace_buf[i]);
> + perf_trace_buf[i] = NULL;
> }
> }
> - mutex_unlock(&event_mutex);
> }
>
> __kprobes void *perf_trace_buf_prepare(int size, unsigned short type,
> struct pt_regs *regs, int *rctxp)
> {
> struct trace_entry *entry;
> - char *trace_buf, *raw_data;
> + char *raw_data;
> int pc;
>
> BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long));
> @@ -139,13 +154,9 @@ __kprobes void *perf_trace_buf_prepare(i
>
> *rctxp = perf_swevent_get_recursion_context();
> if (*rctxp < 0)
> - goto err_recursion;
> -
> - trace_buf = rcu_dereference_sched(perf_trace_buf[*rctxp]);
> - if (!trace_buf)
> - goto err;
> + return NULL;
>
> - raw_data = per_cpu_ptr(trace_buf, smp_processor_id());
> + raw_data = per_cpu_ptr(perf_trace_buf[*rctxp], smp_processor_id());



Needs rcu_dereference_sched() too. And this could use this_cpu_ptr().
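i.e. something like (sketch):

	char *trace_buf;

	trace_buf = rcu_dereference_sched(perf_trace_buf[*rctxp]);
	raw_data = this_cpu_ptr(trace_buf);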




>
> /* zero the dead bytes from align to not leak stack to user */
> memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64));
> @@ -155,9 +166,5 @@ __kprobes void *perf_trace_buf_prepare(i
> entry->type = type;
>
> return raw_data;
> -err:
> - perf_swevent_put_recursion_context(*rctxp);
> -err_recursion:
> - return NULL;
> }
> EXPORT_SYMBOL_GPL(perf_trace_buf_prepare);
> Index: linux-2.6/kernel/trace/trace_kprobe.c
> ===================================================================
> --- linux-2.6.orig/kernel/trace/trace_kprobe.c
> +++ linux-2.6/kernel/trace/trace_kprobe.c
> @@ -1341,6 +1341,7 @@ static __kprobes void kprobe_perf_func(s
> struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
> struct ftrace_event_call *call = &tp->call;
> struct kprobe_trace_entry_head *entry;
> + struct hlist_head *head;
> u8 *data;
> int size, __size, i;
> int rctx;
> @@ -1361,7 +1362,8 @@ static __kprobes void kprobe_perf_func(s
> for (i = 0; i < tp->nr_args; i++)
> call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
>
> - perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, call->perf_data);
> + head = per_cpu_ptr(call->perf_events, smp_processor_id());



Same



> + perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head);
> }
>
> /* Kretprobe profile handler */
> @@ -1371,6 +1373,7 @@ static __kprobes void kretprobe_perf_fun
> struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
> struct ftrace_event_call *call = &tp->call;
> struct kretprobe_trace_entry_head *entry;
> + struct hlist_head *head;
> u8 *data;
> int size, __size, i;
> int rctx;
> @@ -1392,8 +1395,8 @@ static __kprobes void kretprobe_perf_fun
> for (i = 0; i < tp->nr_args; i++)
> call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
>
> - perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1,
> - regs, call->perf_data);
> + head = per_cpu_ptr(call->perf_events, smp_processor_id());



ditto



> + perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head);
> }
>
> static int probe_perf_enable(struct ftrace_event_call *call)
> Index: linux-2.6/kernel/trace/trace_syscalls.c
> ===================================================================
> --- linux-2.6.orig/kernel/trace/trace_syscalls.c
> +++ linux-2.6/kernel/trace/trace_syscalls.c
> @@ -438,6 +438,7 @@ static void perf_syscall_enter(struct pt
> {
> struct syscall_metadata *sys_data;
> struct syscall_trace_enter *rec;
> + struct hlist_head *head;
> int syscall_nr;
> int rctx;
> int size;
> @@ -467,8 +468,9 @@ static void perf_syscall_enter(struct pt
> rec->nr = syscall_nr;
> syscall_get_arguments(current, regs, 0, sys_data->nb_args,
> (unsigned long *)&rec->args);
> - perf_trace_buf_submit(rec, size, rctx, 0, 1, regs,
> - sys_data->enter_event->perf_data);
> +
> + head = per_cpu_ptr(sys_data->enter_event->perf_events, smp_processor_id());



ditto



> + perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
> }
>
> int perf_sysenter_enable(struct ftrace_event_call *call)
> @@ -510,6 +512,7 @@ static void perf_syscall_exit(struct pt_
> {
> struct syscall_metadata *sys_data;
> struct syscall_trace_exit *rec;
> + struct hlist_head *head;
> int syscall_nr;
> int rctx;
> int size;
> @@ -542,8 +545,8 @@ static void perf_syscall_exit(struct pt_
> rec->nr = syscall_nr;
> rec->ret = syscall_get_return_value(current, regs);
>
> - perf_trace_buf_submit(rec, size, rctx, 0, 1, regs,
> - sys_data->exit_event->perf_data);
> + head = per_cpu_ptr(sys_data->exit_event->perf_events, smp_processor_id());



and ditto :)



> + perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
> }
>
> int perf_sysexit_enable(struct ftrace_event_call *call)
> Index: linux-2.6/kernel/perf_event.c
> ===================================================================
> --- linux-2.6.orig/kernel/perf_event.c
> +++ linux-2.6/kernel/perf_event.c
> @@ -4004,9 +4004,6 @@ static void perf_swevent_add(struct perf
> perf_swevent_overflow(event, 0, nmi, data, regs);
> }
>
> -static int perf_tp_event_match(struct perf_event *event,
> - struct perf_sample_data *data);
> -
> static int perf_exclude_event(struct perf_event *event,
> struct pt_regs *regs)
> {
> @@ -4036,10 +4033,6 @@ static int perf_swevent_match(struct per
> if (perf_exclude_event(event, regs))
> return 0;
>
> - if (event->attr.type == PERF_TYPE_TRACEPOINT &&
> - !perf_tp_event_match(event, data))
> - return 0;
> -
> return 1;
> }
>
> @@ -4094,7 +4087,7 @@ end:
>
> int perf_swevent_get_recursion_context(void)
> {
> - struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
> + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
> int rctx;
>
> if (in_nmi())
> @@ -4106,10 +4099,8 @@ int perf_swevent_get_recursion_context(v
> else
> rctx = 0;
>
> - if (cpuctx->recursion[rctx]) {
> - put_cpu_var(perf_cpu_context);
> + if (cpuctx->recursion[rctx])
> return -1;
> - }
>
> cpuctx->recursion[rctx]++;
> barrier();
> @@ -4123,7 +4114,6 @@ void perf_swevent_put_recursion_context(
> struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
> barrier();
> cpuctx->recursion[rctx]--;
> - put_cpu_var(perf_cpu_context);
> }
> EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
>
> @@ -4134,6 +4124,7 @@ void __perf_sw_event(u32 event_id, u64 n
> struct perf_sample_data data;
> int rctx;
>
> + preempt_disable_notrace();


Why is this needed? We have the recursion context protection already.


Ok, otherwise looks good.

From: Frederic Weisbecker
On Fri, May 21, 2010 at 12:02:05PM +0200, Peter Zijlstra wrote:
> On Fri, 2010-05-21 at 11:40 +0200, Frederic Weisbecker wrote:
> > On Fri, May 21, 2010 at 11:02:03AM +0200, Peter Zijlstra wrote:
>
> > > Also, avoid conditionals on the fast path by ordering with probe unregister
> > > so that we should never get on the callback path without the data being there.
> > >
> > \
> > > + head = per_cpu_ptr(event_call->perf_events, smp_processor_id());\
>
> > Shouldn't this be rcu_dereference_sched()?
>
> No, I removed all that rcu stuff and synchronized against the probe
> unregister.
>
> I assumed that after probe unregister a tracepoint callback doesn't
> happen, which then guarantees we should never get !head.



I'm not sure about this. The tracepoints are called under rcu_read_lock(),
but there is no synchronize_rcu() after we unregister a tracepoint, which
means you can have a pending preempted one somewhere.

There is a call_rcu that removes the callbacks, but that only protects
the callbacks themselves.
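So I think the following can happen (sketch):

	CPU 0				CPU 1
	-----				-----
	__DO_TRACE()
	  -> perf_trace_##call()
					perf_trace_destroy()
					  free_percpu(perf_events)
	  head = per_cpu_ptr(...)
	  <use after free>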



>
> > > + for_each_possible_cpu(cpu)
> > > + INIT_HLIST_HEAD(per_cpu_ptr(list, cpu));
> > > +
> > > + tp_event->perf_events = list;
> >
> >
> >
> > I suspect this must be rcu_assign_pointer().
>
> Same thing as above, I do this before probe register, so I see no need
> for RCU.
>
> > > + list = per_cpu_ptr(list, smp_processor_id());
> > > + hlist_add_head_rcu(&p_event->hlist_entry, list);
> >
> >
> >
> > Ah, and maybe add a small comment here, because using the hlist API
> > this way may puzzle more people than just me ;)
>
> What exactly is the puzzlement about?



The fact that we use the hlist API not for its hashing purpose but as a
plain list.




> > > + if (--tp_event->perf_refcount > 0)
> > > + return;
> > > +
> > > + tp_event->perf_event_disable(tp_event);
> >
> >
> >
> > Don't we need a synchronize_sched() here?
>
> Doesn't probe unregister synchronize things against its own callback?



May be I missed it but it doesn't seem so.



> > > + raw_data = per_cpu_ptr(perf_trace_buf[*rctxp], smp_processor_id());
> >
> >
> >
> > Needs rcu_dereference_sched() too. And this could use this_cpu_ptr().
>
> Ahh! So that is what it's called.


:)



> > > + preempt_disable_notrace();
> >
> >
> > Why is this needed? We have the recursion context protection already.
>
> Because:
>
> @@ -4094,7 +4087,7 @@ end:
>
> int perf_swevent_get_recursion_context(void)
> {
> - struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
> + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
> int rctx;
>
> if (in_nmi())
>



Right.

Thanks.

From: Frederic Weisbecker
On Fri, May 21, 2010 at 12:19:09PM +0200, Peter Zijlstra wrote:
> On Fri, 2010-05-21 at 12:13 +0200, Frederic Weisbecker wrote:
> > > I assumed that after probe unregister a tracepoint callback doesn't
> > > happen, which then guarantees we should never get !head.
>
> > I'm not sure about this. The tracepoints are called under rcu_read_lock(),
> > but there is no synchronize_rcu() after we unregister a tracepoint, which
> > means you can have a pending preempted one somewhere.
> >
> > There is a call_rcu that removes the callbacks, but that only protects
> > the callbacks themselves.
>
> Ah, ok, so we should do probe_unregister + synchronize_sched().
> That should ensure __DO_TRACE() doesn't call into it anymore.
>
> /me goes make a patch
>


Yep. But that also means we need rcu_dereference_sched() to access
the per-cpu list of events.

Thanks.

From: Frederic Weisbecker
On Fri, May 21, 2010 at 12:34:03PM +0200, Peter Zijlstra wrote:
> On Fri, 2010-05-21 at 12:21 +0200, Frederic Weisbecker wrote:
> > On Fri, May 21, 2010 at 12:19:09PM +0200, Peter Zijlstra wrote:
> > > On Fri, 2010-05-21 at 12:13 +0200, Frederic Weisbecker wrote:
> > > > > I assumed that after probe unregister a tracepoint callback doesn't
> > > > > happen, which then guarantees we should never get !head.
> > >
> > > > I'm not sure about this. The tracepoints are called under rcu_read_lock(),
> > > > but there is no synchronize_rcu() after we unregister a tracepoint, which
> > > > means you can have a pending preempted one somewhere.
> > > >
> > > > There is a call_rcu that removes the callbacks, but that only protects
> > > > the callbacks themselves.
> > >
> > > Ah, ok, so we should do probe_unregister + synchronize_sched().
> > > That should ensure __DO_TRACE() doesn't call into it anymore.
> > >
> > > /me goes make a patch
> > >
> >
> >
> > Yep. But that also means we need rcu_dereference_sched() to access
> > the per-cpu list of events.
>
> Why?
>
> The per-cpu vars are allocated and freed in a fully serialized manner;
> there should be no races whatsoever.


Ah right, I was confused.
