From: Brandon Philips on
Race in create_irq_nr():

- Thread 1 loops through and calls irq_to_desc_alloc_node with new=0x66.

- Thread 2 has exited the loop with irq=0x66 and calls dynamic_irq_init(0x66)
setting desc->chip_data = NULL

- Thread 1 then dereferences NULL via desc_new->chip_data->vector

Fix by moving holding vector_lock until after the dynamic_irq_init().

BUG: unable to handle kernel NULL pointer dereference at 0000000000000088
IP: [<ffffffff8101df32>] create_irq_nr+0x62/0x100
PGD 23dc24067 PUD 23dc72067 PMD 0
Oops: 0000 [#1] SMP
last sysfs file: /sys/devices/pci0000:00/0000:00:1c.0/0000:08:00.0/net/eth2/type
CPU 12
Modules linked in: i2c_i801 igb(+) iTCO_wdt ixgbe(+) ioatdma(+) e1000e mptctl mdio usb_storage iTCO_vendor_support dca ses button sg pcspkr enclosure container ac usbhid uhci_hcd ehci_hcd usbcore sd_mod edd fan processor ide_pci_generic ide_core ata_generic ata_piix libata lpfc scsi_transport_fc scsi_tgt mptsas mptscsih mptbase scsi_transport_sas megaraid_sas scsi_mod thermal thermal_sys
Supported: Yes
Pid: 1684, comm: modprobe Not tainted 2.6.32.3-0.3-default #1 PRIMERGY RX300 S5
RIP: 0010:[<ffffffff8101df32>] [<ffffffff8101df32>] create_irq_nr+0x62/0x100
RSP: 0018:ffff88013ce0fc18 EFLAGS: 00010086
RAX: ffff88023e11ee00 RBX: 0000000000000066 RCX: 00000000000000c2
RDX: 00000000000000c2 RSI: 00000000ffffffff RDI: 0000000000000066
RBP: 0000000000000000 R08: ffffffff81767a85 R09: 000000000000000a
R10: 0000000000000000 R11: 0000000000000000 R12: 00000000ffffffff
R13: 0000000000000206 R14: ffff88013f381000 R15: 0000000000000080
FS: 00007f16d181e700(0000) GS:ffff880143d00000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
CR2: 0000000000000088 CR3: 000000023d26c000 CR4: 00000000000006e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Process modprobe (pid: 1684, threadinfo ffff88013ce0e000, task ffff88013d080340)
Stack:
0000000000000001 0000000000000000 ffff88023d2d8740 0000000000000064
<0> 0000000000000007 ffffffff8101f2ce 0000000900000009 ffff88013f381810
<0> ffffffff3f381000 0000000000000048 0000000000000009 ffff88013f381000
Call Trace:
[<ffffffff8101f2ce>] arch_setup_msi_irqs+0xce/0x190
[<ffffffff812574b9>] msix_capability_init+0x189/0x2f0
[<ffffffffa032b4a4>] igb_set_interrupt_capability+0xe4/0x1e0 [igb]
[<ffffffffa033634e>] igb_probe+0x3de/0xd15 [igb]
[<ffffffff8124d212>] local_pci_probe+0x12/0x20
[<ffffffff8124d4c0>] __pci_device_probe+0xe0/0xf0
[<ffffffff8124e3d3>] pci_device_probe+0x33/0x60
[<ffffffff812e72f7>] really_probe+0x77/0x230
[<ffffffff812e751a>] driver_probe_device+0x6a/0xc0
[<ffffffff812e7603>] __driver_attach+0x93/0xa0
[<ffffffff812e6928>] bus_for_each_dev+0x58/0x80
[<ffffffff812e6115>] bus_add_driver+0x195/0x2f0
[<ffffffff812e7919>] driver_register+0x79/0x170
[<ffffffff8124e648>] __pci_register_driver+0x58/0xe0
[<ffffffff810001e5>] do_one_initcall+0x35/0x190
[<ffffffff8108af34>] sys_init_module+0xe4/0x270
[<ffffffff81002f7b>] system_call_fastpath+0x16/0x1b
[<00007f16d13b234a>] 0x7f16d13b234a
Code: 2e 0f 1f 84 00 00 00 00 00 83 c3 01 39 1d e7 e2 9f 00 76 7d 44 89 e6 89 df e8 2b 2a 3d 00 48 85 c0 0f 84 8a 00 00 00 48 8b 68 40 <80> bd 88 00 00 00 00 75 d5 44 89 e6 48 89 c7 e8 6a 5c 09 00 49
RIP [<ffffffff8101df32>] create_irq_nr+0x62/0x100
RSP <ffff88013ce0fc18>
CR2: 0000000000000088

Signed-off-by: Brandon Philips <bphilips(a)suse.de>

---
arch/x86/kernel/apic/io_apic.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)

Index: linux-2.6.32-SLE11-SP1/arch/x86/kernel/apic/io_apic.c
===================================================================
--- linux-2.6.32-SLE11-SP1.orig/arch/x86/kernel/apic/io_apic.c
+++ linux-2.6.32-SLE11-SP1/arch/x86/kernel/apic/io_apic.c
@@ -3212,7 +3212,6 @@ unsigned int create_irq_nr(unsigned int
irq = new;
break;
}
- spin_unlock_irqrestore(&vector_lock, flags);

if (irq > 0) {
dynamic_irq_init(irq);
@@ -3220,6 +3219,8 @@ unsigned int create_irq_nr(unsigned int
if (desc_new)
desc_new->chip_data = cfg_new;
}
+ spin_unlock_irqrestore(&vector_lock, flags);
+
return irq;
}

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo(a)vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
From: Yinghai Lu on
On 02/02/2010 07:31 PM, Brandon Philips wrote:
> Race in create_irq_nr():
>
> - Thread 1 loops through and calls irq_to_desc_alloc_node with new=0x66.
>
> - Thread 2 has exited the loop with irq=0x66 and calls dynamic_irq_init(0x66)
> setting desc->chip_data = NULL
>
> - Thread 1 then dereferences NULL via desc_new->chip_data->vector

two threads get same irq?

YH
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo(a)vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
From: Yinghai Lu on
On 02/02/2010 07:31 PM, Brandon Philips wrote:
> Race in create_irq_nr():
>
> - Thread 1 loops through and calls irq_to_desc_alloc_node with new=0x66.
>
> - Thread 2 has exited the loop with irq=0x66 and calls dynamic_irq_init(0x66)
> setting desc->chip_data = NULL
>
> - Thread 1 then dereferences NULL via desc_new->chip_data->vector
>
> Fix by moving holding vector_lock until after the dynamic_irq_init().
>
>
> Index: linux-2.6.32-SLE11-SP1/arch/x86/kernel/apic/io_apic.c
> ===================================================================
> --- linux-2.6.32-SLE11-SP1.orig/arch/x86/kernel/apic/io_apic.c
> +++ linux-2.6.32-SLE11-SP1/arch/x86/kernel/apic/io_apic.c

can you check if

http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commitdiff;h=37ef2a3029fde884808ff1b369677abc7dd9a79a

fix your problem with 2.6.32?

From 37ef2a3029fde884808ff1b369677abc7dd9a79a Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai(a)kernel.org>
Date: Sat, 21 Nov 2009 00:23:37 -0800
Subject: [PATCH] x86: Re-get cfg_new in case reuse/move irq_desc

When irq_desc is moved, we need to make sure to use the right cfg_new.

Signed-off-by: Yinghai Lu <yinghai(a)kernel.org>
LKML-Reference: <4B07A739.3030104(a)kernel.org>
Signed-off-by: Ingo Molnar <mingo(a)elte.hu>
---
arch/x86/kernel/apic/io_apic.c | 1 +
1 files changed, 1 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index ff23719..085e60e 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -3186,6 +3186,7 @@ unsigned int create_irq_nr(unsigned int irq_want, int node)
continue;

desc_new = move_irq_desc(desc_new, node);
+ cfg_new = desc_new->chip_data;

if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0)
irq = new;
--
1.6.6.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo(a)vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
From: Brandon Philips on
On 02:20 Wed 03 Feb 2010, Yinghai Lu wrote:
> On 02/02/2010 07:31 PM, Brandon Philips wrote:
> > Race in create_irq_nr():
> >
> > - Thread 1 loops through and calls irq_to_desc_alloc_node with new=0x66.
> >
> > - Thread 2 has exited the loop with irq=0x66 and calls dynamic_irq_init(0x66)
> > setting desc->chip_data = NULL
> >
> > - Thread 1 then dereferences NULL via desc_new->chip_data->vector
>
> two threads get same irq?

This race happened when two drivers were setting up MSI-X at the same
time via pci_enable_msix(). See this dmesg excerpt:

[ 85.170610] ixgbe 0000:02:00.1: irq 97 for MSI/MSI-X
[ 85.170611] alloc irq_desc for 99 on node -1
[ 85.170613] igb 0000:08:00.1: irq 98 for MSI/MSI-X
[ 85.170614] alloc kstat_irqs on node -1
[ 85.170616] alloc irq_2_iommu on node -1
[ 85.170617] alloc irq_desc for 100 on node -1
[ 85.170619] alloc kstat_irqs on node -1
[ 85.170621] alloc irq_2_iommu on node -1
[ 85.170625] ixgbe 0000:02:00.1: irq 99 for MSI/MSI-X
[ 85.170626] alloc irq_desc for 101 on node -1
[ 85.170628] igb 0000:08:00.1: irq 100 for MSI/MSI-X
[ 85.170630] alloc kstat_irqs on node -1
[ 85.170631] alloc irq_2_iommu on node -1
[ 85.170635] alloc irq_desc for 102 on node -1
[ 85.170636] alloc kstat_irqs on node -1
[ 85.170639] alloc irq_2_iommu on node -1
[ 85.170646] BUG: unable to handle kernel NULL pointer dereference
at 0000000000000088

As you can see igb and ixgbe are both alternating on create_irq_nr()
via pci_enable_msix() in their probe function. So, let me rewrite my
explanation using this example:

ixgbe: While looping through irq_desc_ptrs[] via create_irq_nr() ixgbe
choses irq_desc_ptrs[102] and exits the loop, drops vector_lock and
calls dynamic_irq_init. Then it sets irq_desc_ptrs[102]->chip_data =
NULL via dynamic_irq_init().

igb: Grabs the vector_lock now and starts looping over irq_desc_ptrs[]
via create_irq_nr(). It gets to irq_desc_ptrs[102] and does this:

cfg_new = irq_desc_ptrs[102]->chip_data;
if (cfg_new->vector != 0)
continue;

This hits the NULL deref.

Does that make sense? It is sort of a rare thing to reproduce- took
40+ reboots.

Thanks,

Brandon
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo(a)vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
From: Yinghai Lu on
On 02/03/2010 09:42 AM, Brandon Philips wrote:
> On 02:20 Wed 03 Feb 2010, Yinghai Lu wrote:
>> On 02/02/2010 07:31 PM, Brandon Philips wrote:
>>> Race in create_irq_nr():
>>>
>>> - Thread 1 loops through and calls irq_to_desc_alloc_node with new=0x66.
>>>
>>> - Thread 2 has exited the loop with irq=0x66 and calls dynamic_irq_init(0x66)
>>> setting desc->chip_data = NULL
>>>
>>> - Thread 1 then dereferences NULL via desc_new->chip_data->vector
>>
>> two threads get same irq?
>
> This race happened when two drivers were setting up MSI-X at the same
> time via pci_enable_msix(). See this dmesg excerpt:
>
> [ 85.170610] ixgbe 0000:02:00.1: irq 97 for MSI/MSI-X
> [ 85.170611] alloc irq_desc for 99 on node -1
> [ 85.170613] igb 0000:08:00.1: irq 98 for MSI/MSI-X
> [ 85.170614] alloc kstat_irqs on node -1
> [ 85.170616] alloc irq_2_iommu on node -1
> [ 85.170617] alloc irq_desc for 100 on node -1
> [ 85.170619] alloc kstat_irqs on node -1
> [ 85.170621] alloc irq_2_iommu on node -1
> [ 85.170625] ixgbe 0000:02:00.1: irq 99 for MSI/MSI-X
> [ 85.170626] alloc irq_desc for 101 on node -1
> [ 85.170628] igb 0000:08:00.1: irq 100 for MSI/MSI-X
> [ 85.170630] alloc kstat_irqs on node -1
> [ 85.170631] alloc irq_2_iommu on node -1
> [ 85.170635] alloc irq_desc for 102 on node -1
> [ 85.170636] alloc kstat_irqs on node -1
> [ 85.170639] alloc irq_2_iommu on node -1
> [ 85.170646] BUG: unable to handle kernel NULL pointer dereference
> at 0000000000000088
>
> As you can see igb and ixgbe are both alternating on create_irq_nr()
> via pci_enable_msix() in their probe function. So, let me rewrite my
> explanation using this example:
>
> ixgbe: While looping through irq_desc_ptrs[] via create_irq_nr() ixgbe
> choses irq_desc_ptrs[102] and exits the loop, drops vector_lock and
> calls dynamic_irq_init. Then it sets irq_desc_ptrs[102]->chip_data =
> NULL via dynamic_irq_init().
>
> igb: Grabs the vector_lock now and starts looping over irq_desc_ptrs[]
> via create_irq_nr(). It gets to irq_desc_ptrs[102] and does this:
>
> cfg_new = irq_desc_ptrs[102]->chip_data;
> if (cfg_new->vector != 0)
> continue;
>
> This hits the NULL deref.
>

please try following patch in addition to

http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commitdiff;h=37ef2a3029fde884808ff1b369677abc7dd9a79a

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 7edafc7..14099ba 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -3280,12 +3280,9 @@ unsigned int create_irq_nr(unsigned int irq_want, int node)
}
spin_unlock_irqrestore(&vector_lock, flags);

- if (irq > 0) {
- dynamic_irq_init(irq);
- /* restore it, in case dynamic_irq_init clear it */
- if (desc_new)
- desc_new->chip_data = cfg_new;
- }
+ if (irq > 0)
+ dynamic_irq_init_keep_chip_data(irq);
+
return irq;
}

diff --git a/include/linux/irq.h b/include/linux/irq.h
index d13492d..cd6b870 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -400,6 +400,7 @@ static inline int irq_has_action(unsigned int irq)

/* Dynamic irq helper functions */
extern void dynamic_irq_init(unsigned int irq);
+void dynamic_irq_init_keep_chip_data(unsigned int irq);
extern void dynamic_irq_cleanup(unsigned int irq);

/* Set/get chip/data for an IRQ: */
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index ecc3fa2..370dbc4 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -22,7 +22,7 @@
* dynamic_irq_init - initialize a dynamically allocated irq
* @irq: irq number to initialize
*/
-void dynamic_irq_init(unsigned int irq)
+static void dynamic_irq_init_x(unsigned int irq, bool keep_chip_data)
{
struct irq_desc *desc;
unsigned long flags;
@@ -41,7 +41,8 @@ void dynamic_irq_init(unsigned int irq)
desc->depth = 1;
desc->msi_desc = NULL;
desc->handler_data = NULL;
- desc->chip_data = NULL;
+ if (!keep_chip_data)
+ desc->chip_data = NULL;
desc->action = NULL;
desc->irq_count = 0;
desc->irqs_unhandled = 0;
@@ -54,6 +55,16 @@ void dynamic_irq_init(unsigned int irq)
raw_spin_unlock_irqrestore(&desc->lock, flags);
}

+void dynamic_irq_init(unsigned int irq)
+{
+ dynamic_irq_init_x(irq, false);
+}
+
+void dynamic_irq_init_keep_chip_data(unsigned int irq)
+{
+ dynamic_irq_init_x(irq, true);
+}
+
/**
* dynamic_irq_cleanup - cleanup a dynamically allocated irq
* @irq: irq number to initialize
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo(a)vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/