From: Dominik Brodowski on
On Mon, Apr 05, 2010 at 03:31:30PM -0700, Paul E. McKenney wrote:
> > > 5) powertop and hrtimer_start_range_ns (tick_sched_timer) on a SMP kernel
> > > booted with "nosmp":
> > >
> > > Wakeups-from-idle per second : 9.9 interval: 15.0s
> > > ...
> > > 48.5% ( 9.4) <kernel core> : hrtimer_start (tick_sched_timer)
> > > 26.1% ( 5.1) <kernel core> : cursor_timer_handler
> > > (cursor_timer_handle
> > > 20.6% ( 4.0) <kernel core> : usb_hcd_poll_rh_status (rh_timer_func)
> > > 1.0% ( 0.2) <kernel core> : arm_supers_timer
> > > (sync_supers_timer_fn)
> > > 0.7% ( 0.1) <interrupt> : ata_piix
> > > ...
> > >
> > > Accoding to http://www.linuxpowertop.org , the count in the brackets is
> > > how
> > > many wakeups per seconds were caused by one source. Adding all _except_
> > > 48.5% ( 9.4) <kernel core> : hrtimer_start (tick_sched_timer)
> > > up leads to the 9.9.
>
> OK, so you further instrumented the hrtimer_sched_timer (or was it
> tick_sched_timer?) to find the number that you were attributing to
> rcu_needs_cpu()?

That's what I did -- to tick_nohz_stop_sched_tick(), to see why the
tick_sched_timer did not get stopped. Or why powertop thinks it did not
get stopped...

Patch below (works only for 1 or 2 CPUs) for all interested parties,
_NOT_ intended for submission, though ;)

0x0001: CPU #1
0x0002: inidle
0x0004: not inidle
0x0008: INACTIVE
0x0010: need_resched
0x0020: softirq
0x0040: rcu_needs_
0x0080: printk_needs_
0x0100: arch_needs_
0x0200: off-by-one
0x0400: skip
0x0800: loadbalance:
0x1000: cancel timer (SUCCESS)
0x2000: timer was already in the past
0x4000: hrtimer_start() (SUCCESS)
0x8000: stop_tick (SUCCESS)

Fixing USB-autosuspend and disabling the fbcon cursor, I'm down to
~0.9 wakeups-per-second booting with "nosmp".

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index f992762..6ba7bae 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -210,19 +210,35 @@ void tick_nohz_stop_sched_tick(int inidle)
struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
u64 time_delta;
int cpu;
+ static int testrun_counter = 0;
+ int testrun = 0;
+ u64 debugdata = 0;
+
+ testrun_counter++;
+
+ if ((testrun_counter % 123) == 0)
+ testrun = 1;

local_irq_save(flags);

cpu = smp_processor_id();
ts = &per_cpu(tick_cpu_sched, cpu);

+ debugdata = cpu;
+
+ if (inidle)
+ debugdata |= 0x2;
+
+
/*
* Call to tick_nohz_start_idle stops the last_update_time from being
* updated. Thus, it must not be called in the event we are called from
* irq_exit() with the prior state different than idle.
*/
- if (!inidle && !ts->inidle)
+ if (!inidle && !ts->inidle) {
+ debugdata |= 0x4;
goto end;
+ }

/*
* Set ts->inidle unconditionally. Even if the system did not
@@ -245,11 +261,15 @@ void tick_nohz_stop_sched_tick(int inidle)
tick_do_timer_cpu = TICK_DO_TIMER_NONE;
}

- if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
+ if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) {
+ debugdata |= 0x8;
goto end;
+ }

- if (need_resched())
+ if (need_resched()) {
+ debugdata |= 0x10;
goto end;
+ }

if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
static int ratelimit;
@@ -259,6 +279,7 @@ void tick_nohz_stop_sched_tick(int inidle)
(unsigned int) local_softirq_pending());
ratelimit++;
}
+ debugdata |= 0x20;
goto end;
}

@@ -275,6 +296,12 @@ void tick_nohz_stop_sched_tick(int inidle)
arch_needs_cpu(cpu)) {
next_jiffies = last_jiffies + 1;
delta_jiffies = 1;
+ if (rcu_needs_cpu(cpu))
+ debugdata |= 0x40;
+ if (printk_needs_cpu(cpu))
+ debugdata |= 0x80;
+ if (arch_needs_cpu(cpu))
+ debugdata |= 0x100;
} else {
/* Get the next timer wheel timer */
next_jiffies = get_next_timer_interrupt(last_jiffies);
@@ -284,8 +311,10 @@ void tick_nohz_stop_sched_tick(int inidle)
* Do not stop the tick, if we are only one off
* or if the cpu is required for rcu
*/
- if (!ts->tick_stopped && delta_jiffies == 1)
+ if (!ts->tick_stopped && delta_jiffies == 1) {
+ debugdata |= 0x200;
goto out;
+ }

/* Schedule the tick, if we are at least one jiffie off */
if ((long)delta_jiffies >= 1) {
@@ -341,8 +370,10 @@ void tick_nohz_stop_sched_tick(int inidle)
cpumask_set_cpu(cpu, nohz_cpu_mask);

/* Skip reprogram of event if its not changed */
- if (ts->tick_stopped && ktime_equal(expires, dev->next_event))
+ if (ts->tick_stopped && ktime_equal(expires, dev->next_event)) {
+ debugdata |= 0x400;
goto out;
+ }

/*
* nohz_stop_sched_tick can be called several times before
@@ -357,8 +388,10 @@ void tick_nohz_stop_sched_tick(int inidle)
* sched tick not stopped!
*/
cpumask_clear_cpu(cpu, nohz_cpu_mask);
+ debugdata |= 0x800;
goto out;
}
+ debugdata |= 0x8000;

ts->idle_tick = hrtimer_get_expires(&ts->sched_timer);
ts->tick_stopped = 1;
@@ -375,7 +408,8 @@ void tick_nohz_stop_sched_tick(int inidle)
* If the expiration time == KTIME_MAX, then
* in this case we simply stop the tick timer.
*/
- if (unlikely(expires.tv64 == KTIME_MAX)) {
+ if (unlikely(expires.tv64 == KTIME_MAX)) {
+ debugdata |= 0x1000;
if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
hrtimer_cancel(&ts->sched_timer);
goto out;
@@ -384,9 +418,13 @@ void tick_nohz_stop_sched_tick(int inidle)
if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
hrtimer_start(&ts->sched_timer, expires,
HRTIMER_MODE_ABS_PINNED);
+ debugdata |= 0x2000;
/* Check, if the timer was already in the past */
- if (hrtimer_active(&ts->sched_timer))
+ if (hrtimer_active(&ts->sched_timer)) {
+ debugdata &= ~(0x2000);
+ debugdata |= 0x4000;
goto out;
+ }
} else if (!tick_program_event(expires, 0))
goto out;
/*
@@ -403,6 +441,8 @@ out:
ts->last_jiffies = last_jiffies;
ts->sleep_length = ktime_sub(dev->next_event, now);
end:
+ if (testrun)
+ printk(KERN_DEBUG "0x%0llx\n", debugdata);
local_irq_restore(flags);
}

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo(a)vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
From: Paul E. McKenney on
On Tue, Apr 06, 2010 at 10:45:37PM +0200, Dominik Brodowski wrote:
> On Mon, Apr 05, 2010 at 03:31:30PM -0700, Paul E. McKenney wrote:
> > > > 5) powertop and hrtimer_start_range_ns (tick_sched_timer) on a SMP kernel
> > > > booted with "nosmp":
> > > >
> > > > Wakeups-from-idle per second : 9.9 interval: 15.0s
> > > > ...
> > > > 48.5% ( 9.4) <kernel core> : hrtimer_start (tick_sched_timer)
> > > > 26.1% ( 5.1) <kernel core> : cursor_timer_handler
> > > > (cursor_timer_handle
> > > > 20.6% ( 4.0) <kernel core> : usb_hcd_poll_rh_status (rh_timer_func)
> > > > 1.0% ( 0.2) <kernel core> : arm_supers_timer
> > > > (sync_supers_timer_fn)
> > > > 0.7% ( 0.1) <interrupt> : ata_piix
> > > > ...
> > > >
> > > > Accoding to http://www.linuxpowertop.org , the count in the brackets is
> > > > how
> > > > many wakeups per seconds were caused by one source. Adding all _except_
> > > > 48.5% ( 9.4) <kernel core> : hrtimer_start (tick_sched_timer)
> > > > up leads to the 9.9.
> >
> > OK, so you further instrumented the hrtimer_sched_timer (or was it
> > tick_sched_timer?) to find the number that you were attributing to
> > rcu_needs_cpu()?
>
> That's what I did -- to tick_nohz_stop_sched_tick(), to see why the
> tick_sched_timer did not get stopped. Or why powertop thinks it did not
> get stopped...
>
> Patch below (works only for 1 or 2 CPUs) for all interested parties,
> _NOT_ intended for submission, though ;)

Cool!

If you ever do want to submit such a patch, I should give you multiple
non-zero return values from rcu_needs_cpu() to allow distinguishing between
the following:

1. RCU needs the CPU, but is trying to get done with it quickly.

2. RCU needs the CPU, and has given up on trying to get done with
it quickly.

Unless you tell me otherwise, I will assume that you do -not- need this.

Thanx, Paul

> 0x0001: CPU #1
> 0x0002: inidle
> 0x0004: not inidle
> 0x0008: INACTIVE
> 0x0010: need_resched
> 0x0020: softirq
> 0x0040: rcu_needs_
> 0x0080: printk_needs_
> 0x0100: arch_needs_
> 0x0200: off-by-one
> 0x0400: skip
> 0x0800: loadbalance:
> 0x1000: cancel timer (SUCCESS)
> 0x2000: timer was already in the past
> 0x4000: hrtimer_start() (SUCCESS)
> 0x8000: stop_tick (SUCCESS)
>
> Fixing USB-autosuspend and disabling the fbcon cursor, I'm down to
> ~0.9 wakeups-per-second booting with "nosmp".
>
> diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
> index f992762..6ba7bae 100644
> --- a/kernel/time/tick-sched.c
> +++ b/kernel/time/tick-sched.c
> @@ -210,19 +210,35 @@ void tick_nohz_stop_sched_tick(int inidle)
> struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
> u64 time_delta;
> int cpu;
> + static int testrun_counter = 0;
> + int testrun = 0;
> + u64 debugdata = 0;
> +
> + testrun_counter++;
> +
> + if ((testrun_counter % 123) == 0)
> + testrun = 1;
>
> local_irq_save(flags);
>
> cpu = smp_processor_id();
> ts = &per_cpu(tick_cpu_sched, cpu);
>
> + debugdata = cpu;
> +
> + if (inidle)
> + debugdata |= 0x2;
> +
> +
> /*
> * Call to tick_nohz_start_idle stops the last_update_time from being
> * updated. Thus, it must not be called in the event we are called from
> * irq_exit() with the prior state different than idle.
> */
> - if (!inidle && !ts->inidle)
> + if (!inidle && !ts->inidle) {
> + debugdata |= 0x4;
> goto end;
> + }
>
> /*
> * Set ts->inidle unconditionally. Even if the system did not
> @@ -245,11 +261,15 @@ void tick_nohz_stop_sched_tick(int inidle)
> tick_do_timer_cpu = TICK_DO_TIMER_NONE;
> }
>
> - if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
> + if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) {
> + debugdata |= 0x8;
> goto end;
> + }
>
> - if (need_resched())
> + if (need_resched()) {
> + debugdata |= 0x10;
> goto end;
> + }
>
> if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
> static int ratelimit;
> @@ -259,6 +279,7 @@ void tick_nohz_stop_sched_tick(int inidle)
> (unsigned int) local_softirq_pending());
> ratelimit++;
> }
> + debugdata |= 0x20;
> goto end;
> }
>
> @@ -275,6 +296,12 @@ void tick_nohz_stop_sched_tick(int inidle)
> arch_needs_cpu(cpu)) {
> next_jiffies = last_jiffies + 1;
> delta_jiffies = 1;
> + if (rcu_needs_cpu(cpu))
> + debugdata |= 0x40;
> + if (printk_needs_cpu(cpu))
> + debugdata |= 0x80;
> + if (arch_needs_cpu(cpu))
> + debugdata |= 0x100;
> } else {
> /* Get the next timer wheel timer */
> next_jiffies = get_next_timer_interrupt(last_jiffies);
> @@ -284,8 +311,10 @@ void tick_nohz_stop_sched_tick(int inidle)
> * Do not stop the tick, if we are only one off
> * or if the cpu is required for rcu
> */
> - if (!ts->tick_stopped && delta_jiffies == 1)
> + if (!ts->tick_stopped && delta_jiffies == 1) {
> + debugdata |= 0x200;
> goto out;
> + }
>
> /* Schedule the tick, if we are at least one jiffie off */
> if ((long)delta_jiffies >= 1) {
> @@ -341,8 +370,10 @@ void tick_nohz_stop_sched_tick(int inidle)
> cpumask_set_cpu(cpu, nohz_cpu_mask);
>
> /* Skip reprogram of event if its not changed */
> - if (ts->tick_stopped && ktime_equal(expires, dev->next_event))
> + if (ts->tick_stopped && ktime_equal(expires, dev->next_event)) {
> + debugdata |= 0x400;
> goto out;
> + }
>
> /*
> * nohz_stop_sched_tick can be called several times before
> @@ -357,8 +388,10 @@ void tick_nohz_stop_sched_tick(int inidle)
> * sched tick not stopped!
> */
> cpumask_clear_cpu(cpu, nohz_cpu_mask);
> + debugdata |= 0x800;
> goto out;
> }
> + debugdata |= 0x8000;
>
> ts->idle_tick = hrtimer_get_expires(&ts->sched_timer);
> ts->tick_stopped = 1;
> @@ -375,7 +408,8 @@ void tick_nohz_stop_sched_tick(int inidle)
> * If the expiration time == KTIME_MAX, then
> * in this case we simply stop the tick timer.
> */
> - if (unlikely(expires.tv64 == KTIME_MAX)) {
> + if (unlikely(expires.tv64 == KTIME_MAX)) {
> + debugdata |= 0x1000;
> if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
> hrtimer_cancel(&ts->sched_timer);
> goto out;
> @@ -384,9 +418,13 @@ void tick_nohz_stop_sched_tick(int inidle)
> if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
> hrtimer_start(&ts->sched_timer, expires,
> HRTIMER_MODE_ABS_PINNED);
> + debugdata |= 0x2000;
> /* Check, if the timer was already in the past */
> - if (hrtimer_active(&ts->sched_timer))
> + if (hrtimer_active(&ts->sched_timer)) {
> + debugdata &= ~(0x2000);
> + debugdata |= 0x4000;
> goto out;
> + }
> } else if (!tick_program_event(expires, 0))
> goto out;
> /*
> @@ -403,6 +441,8 @@ out:
> ts->last_jiffies = last_jiffies;
> ts->sleep_length = ktime_sub(dev->next_event, now);
> end:
> + if (testrun)
> + printk(KERN_DEBUG "0x%0llx\n", debugdata);
> local_irq_restore(flags);
> }
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo(a)vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/