From: Yinghai Lu on
On 02/05/2010 01:09 PM, Brandon Philips wrote:
> When two drivers are setting up MSI-X at the same time via
> pci_enable_msix() there is a race. See this dmesg excerpt:
>
> [ 85.170610] ixgbe 0000:02:00.1: irq 97 for MSI/MSI-X
> [ 85.170611] alloc irq_desc for 99 on node -1
> [ 85.170613] igb 0000:08:00.1: irq 98 for MSI/MSI-X
> [ 85.170614] alloc kstat_irqs on node -1
> [ 85.170616] alloc irq_2_iommu on node -1
> [ 85.170617] alloc irq_desc for 100 on node -1
> [ 85.170619] alloc kstat_irqs on node -1
> [ 85.170621] alloc irq_2_iommu on node -1
> [ 85.170625] ixgbe 0000:02:00.1: irq 99 for MSI/MSI-X
> [ 85.170626] alloc irq_desc for 101 on node -1
> [ 85.170628] igb 0000:08:00.1: irq 100 for MSI/MSI-X
> [ 85.170630] alloc kstat_irqs on node -1
> [ 85.170631] alloc irq_2_iommu on node -1
> [ 85.170635] alloc irq_desc for 102 on node -1
> [ 85.170636] alloc kstat_irqs on node -1
> [ 85.170639] alloc irq_2_iommu on node -1
> [ 85.170646] BUG: unable to handle kernel NULL pointer dereference
> at 0000000000000088
>
> As you can see igb and ixgbe are both alternating on create_irq_nr()
> via pci_enable_msix() in their probe function.
>
> ixgbe: While looping through irq_desc_ptrs[] via create_irq_nr() ixgbe
> chooses irq_desc_ptrs[102] and exits the loop, drops vector_lock and
> calls dynamic_irq_init. Then it sets irq_desc_ptrs[102]->chip_data =
> NULL via dynamic_irq_init().
>
> igb: Grabs the vector_lock now and starts looping over irq_desc_ptrs[]
> via create_irq_nr(). It gets to irq_desc_ptrs[102] and does this:
>
> cfg_new = irq_desc_ptrs[102]->chip_data;
> if (cfg_new->vector != 0)
> continue;
>
> This hits the NULL deref.
>
> Another possible race exists via pci_disable_msix() in a driver or in
> a number of error paths that call free_msi_irqs():
>
> destroy_irq()
> dynamic_irq_cleanup() which sets desc->chip_data = NULL
> ...race window...
> desc->chip_data = cfg;
>
> Remove the save and restore code for cfg in create_irq_nr() and
> destroy_irq() and take the desc->lock when checking the irq_cfg.
>
> Reported-and-analyzed-by: Brandon Philips <bphilips(a)suse.de>
> Signed-off-by: Yinghai Lu <yinghai(a)kernel.org>
> Signed-off-by: Brandon Philips <bphilips(a)suse.de>
> Cc: stable(a)kernel.org
>
> ---
> arch/x86/kernel/apic/io_apic.c | 14 +++--------
> include/linux/irq.h | 2 +
> kernel/irq/chip.c | 52 +++++++++++++++++++++++++++++++++--------
> 3 files changed, 49 insertions(+), 19 deletions(-)
>
> Index: linux-2.6/arch/x86/kernel/apic/io_apic.c
> ===================================================================
> --- linux-2.6.orig/arch/x86/kernel/apic/io_apic.c
> +++ linux-2.6/arch/x86/kernel/apic/io_apic.c
> @@ -3228,12 +3228,9 @@ unsigned int create_irq_nr(unsigned int
> }
> spin_unlock_irqrestore(&vector_lock, flags);
>
> - if (irq > 0) {
> - dynamic_irq_init(irq);
> - /* restore it, in case dynamic_irq_init clear it */
> - if (desc_new)
> - desc_new->chip_data = cfg_new;
> - }
> + if (irq > 0)
> + dynamic_irq_init_keep_chip_data(irq);
> +
> return irq;
> }
>
> @@ -3260,10 +3257,7 @@ void destroy_irq(unsigned int irq)
>
> /* store it, in case dynamic_irq_cleanup clear it */
> desc = irq_to_desc(irq);
> - cfg = desc->chip_data;
> - dynamic_irq_cleanup(irq);
> - /* connect back irq_cfg */
> - desc->chip_data = cfg;
> + dynamic_irq_cleanup_keep_chip_data(irq);
>
> free_irte(irq);
> spin_lock_irqsave(&vector_lock, flags);
> Index: linux-2.6/include/linux/irq.h
> ===================================================================
> --- linux-2.6.orig/include/linux/irq.h
> +++ linux-2.6/include/linux/irq.h
> @@ -400,7 +400,9 @@ static inline int irq_has_action(unsigne
>
> /* Dynamic irq helper functions */
> extern void dynamic_irq_init(unsigned int irq);
> +extern void dynamic_irq_init_keep_chip_data(unsigned int irq);
> extern void dynamic_irq_cleanup(unsigned int irq);
> +extern void dynamic_irq_cleanup_keep_chip_data(unsigned int irq);
>
> /* Set/get chip/data for an IRQ: */
> extern int set_irq_chip(unsigned int irq, struct irq_chip *chip);
> Index: linux-2.6/kernel/irq/chip.c
> ===================================================================
> --- linux-2.6.orig/kernel/irq/chip.c
> +++ linux-2.6/kernel/irq/chip.c
> @@ -18,11 +18,7 @@
>
> #include "internals.h"
>
> -/**
> - * dynamic_irq_init - initialize a dynamically allocated irq
> - * @irq: irq number to initialize
> - */
> -void dynamic_irq_init(unsigned int irq)
> +static void dynamic_irq_init_x(unsigned int irq, bool keep_chip_data)
> {
> struct irq_desc *desc;
> unsigned long flags;
> @@ -41,7 +37,8 @@ void dynamic_irq_init(unsigned int irq)
> desc->depth = 1;
> desc->msi_desc = NULL;
> desc->handler_data = NULL;
> - desc->chip_data = NULL;
> + if (!keep_chip_data)
> + desc->chip_data = NULL;
> desc->action = NULL;
> desc->irq_count = 0;
> desc->irqs_unhandled = 0;
> @@ -55,10 +52,26 @@ void dynamic_irq_init(unsigned int irq)
> }
>
> /**
> - * dynamic_irq_cleanup - cleanup a dynamically allocated irq
> + * dynamic_irq_init - initialize a dynamically allocated irq
> * @irq: irq number to initialize
> */
> -void dynamic_irq_cleanup(unsigned int irq)
> +void dynamic_irq_init(unsigned int irq)
> +{
> + dynamic_irq_init_x(irq, false);
> +}
> +
> +/**
> + * dynamic_irq_init_keep_chip_data - initialize a dynamically allocated irq
> + * @irq: irq number to initialize
> + *
> + * does not set irq_to_desc(irq)->chip_data to NULL
> + */
> +void dynamic_irq_init_keep_chip_data(unsigned int irq)
> +{
> + dynamic_irq_init_x(irq, true);
> +}
> +
> +static void dynamic_irq_cleanup_x(unsigned int irq, bool keep_chip_data)
> {
> struct irq_desc *desc = irq_to_desc(irq);
> unsigned long flags;
> @@ -77,7 +90,8 @@ void dynamic_irq_cleanup(unsigned int ir
> }
> desc->msi_desc = NULL;
> desc->handler_data = NULL;
> - desc->chip_data = NULL;
> + if (!keep_chip_data)
> + desc->chip_data = NULL;
> desc->handle_irq = handle_bad_irq;
> desc->chip = &no_irq_chip;
> desc->name = NULL;
> @@ -85,6 +99,26 @@ void dynamic_irq_cleanup(unsigned int ir
> raw_spin_unlock_irqrestore(&desc->lock, flags);
> }
>
> +/**
> + * dynamic_irq_cleanup - cleanup a dynamically allocated irq
> + * @irq: irq number to initialize
> + */
> +void dynamic_irq_cleanup(unsigned int irq)
> +{
> + dynamic_irq_init_x(irq, false);

should be dynamic_irq_cleanup_x here.



> +}
> +
> +/**
> + * dynamic_irq_cleanup_keep_chip_data - cleanup a dynamically allocated irq
> + * @irq: irq number to initialize
> + *
> + * does not set irq_to_desc(irq)->chip_data to NULL
> + */
> +void dynamic_irq_cleanup_keep_chip_data(unsigned int irq)
> +{
> + dynamic_irq_init_x(irq, true);

should be dynamic_irq_cleanup_x

> +}
> +
>
> /**
> * set_irq_chip - set the irq chip for an irq

YH
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo(a)vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
From: Brandon Philips on
On 14:44 Fri 05 Feb 2010, Yinghai Lu wrote:
> On 02/05/2010 01:09 PM, Brandon Philips wrote:
> > @@ -77,7 +90,8 @@ void dynamic_irq_cleanup(unsigned int ir
> > }
> > desc->msi_desc = NULL;
> > desc->handler_data = NULL;
> > - desc->chip_data = NULL;
> > + if (!keep_chip_data)
> > + desc->chip_data = NULL;
> > desc->handle_irq = handle_bad_irq;
> > desc->chip = &no_irq_chip;
> > desc->name = NULL;
> > @@ -85,6 +99,26 @@ void dynamic_irq_cleanup(unsigned int ir
> > raw_spin_unlock_irqrestore(&desc->lock, flags);
> > }
> >
> > +/**
> > + * dynamic_irq_cleanup - cleanup a dynamically allocated irq
> > + * @irq: irq number to initialize
> > + */
> > +void dynamic_irq_cleanup(unsigned int irq)
> > +{
> > + dynamic_irq_init_x(irq, false);
>
> should be dynamic_irq_cleanup_x here.
>
> > +}
> > +
> > +/**
> > + * dynamic_irq_cleanup_keep_chip_data - cleanup a dynamically allocated irq
> > + * @irq: irq number to initialize
> > + *
> > + * does not set irq_to_desc(irq)->chip_data to NULL
> > + */
> > +void dynamic_irq_cleanup_keep_chip_data(unsigned int irq)
> > +{
> > + dynamic_irq_init_x(irq, true);
>
> should be dynamic_irq_cleanup_x

Oops, right. I will fix this up along with the externs as hpa
suggested and send again.

What are your thoughts on locking? Does it look OK as is?

Thanks,

Brandon
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo(a)vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
From: Yinghai Lu on
On 02/05/2010 02:55 PM, Brandon Philips wrote:
> On 14:44 Fri 05 Feb 2010, Yinghai Lu wrote:
>> On 02/05/2010 01:09 PM, Brandon Philips wrote:
>>> @@ -77,7 +90,8 @@ void dynamic_irq_cleanup(unsigned int ir
>>> }
>>> desc->msi_desc = NULL;
>>> desc->handler_data = NULL;
>>> - desc->chip_data = NULL;
>>> + if (!keep_chip_data)
>>> + desc->chip_data = NULL;
>>> desc->handle_irq = handle_bad_irq;
>>> desc->chip = &no_irq_chip;
>>> desc->name = NULL;
>>> @@ -85,6 +99,26 @@ void dynamic_irq_cleanup(unsigned int ir
>>> raw_spin_unlock_irqrestore(&desc->lock, flags);
>>> }
>>>
>>> +/**
>>> + * dynamic_irq_cleanup - cleanup a dynamically allocated irq
>>> + * @irq: irq number to initialize
>>> + */
>>> +void dynamic_irq_cleanup(unsigned int irq)
>>> +{
>>> + dynamic_irq_init_x(irq, false);
>>
>> should be dynamic_irq_cleanup_x here.
>>
>>> +}
>>> +
>>> +/**
>>> + * dynamic_irq_cleanup_keep_chip_data - cleanup a dynamically allocated irq
>>> + * @irq: irq number to initialize
>>> + *
>>> + * does not set irq_to_desc(irq)->chip_data to NULL
>>> + */
>>> +void dynamic_irq_cleanup_keep_chip_data(unsigned int irq)
>>> +{
>>> + dynamic_irq_init_x(irq, true);
>>
>> should be dynamic_irq_cleanup_x
>
> Oops, right. I will fix this up along with the externs as hpa
> suggested and send again.

>
> What are your thoughts on locking? Does it look OK as is?
>

ok to me.

YH
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo(a)vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
From: Brandon Philips on
Version 3: Forgot to refresh patch so destroy_irq() had uninitialized
cfg as param to __clear_irq_vector(). Fixed now.

When two drivers are setting up MSI-X at the same time via
pci_enable_msix() there is a race. See this dmesg excerpt:

[ 85.170610] ixgbe 0000:02:00.1: irq 97 for MSI/MSI-X
[ 85.170611] alloc irq_desc for 99 on node -1
[ 85.170613] igb 0000:08:00.1: irq 98 for MSI/MSI-X
[ 85.170614] alloc kstat_irqs on node -1
[ 85.170616] alloc irq_2_iommu on node -1
[ 85.170617] alloc irq_desc for 100 on node -1
[ 85.170619] alloc kstat_irqs on node -1
[ 85.170621] alloc irq_2_iommu on node -1
[ 85.170625] ixgbe 0000:02:00.1: irq 99 for MSI/MSI-X
[ 85.170626] alloc irq_desc for 101 on node -1
[ 85.170628] igb 0000:08:00.1: irq 100 for MSI/MSI-X
[ 85.170630] alloc kstat_irqs on node -1
[ 85.170631] alloc irq_2_iommu on node -1
[ 85.170635] alloc irq_desc for 102 on node -1
[ 85.170636] alloc kstat_irqs on node -1
[ 85.170639] alloc irq_2_iommu on node -1
[ 85.170646] BUG: unable to handle kernel NULL pointer dereference
at 0000000000000088

As you can see igb and ixgbe are both alternating on create_irq_nr()
via pci_enable_msix() in their probe function.

ixgbe: While looping through irq_desc_ptrs[] via create_irq_nr() ixgbe
chooses irq_desc_ptrs[102] and exits the loop, drops vector_lock and
calls dynamic_irq_init. Then it sets irq_desc_ptrs[102]->chip_data =
NULL via dynamic_irq_init().

igb: Grabs the vector_lock now and starts looping over irq_desc_ptrs[]
via create_irq_nr(). It gets to irq_desc_ptrs[102] and does this:

cfg_new = irq_desc_ptrs[102]->chip_data;
if (cfg_new->vector != 0)
continue;

This hits the NULL deref.

Another possible race exists via pci_disable_msix() in a driver or in
a number of error paths that call free_msi_irqs():

destroy_irq()
dynamic_irq_cleanup() which sets desc->chip_data = NULL
....race window...
desc->chip_data = cfg;

Remove the save and restore code for cfg in create_irq_nr() and
destroy_irq() and take the desc->lock when checking the irq_cfg.

Reported-and-analyzed-by: Brandon Philips <bphilips(a)suse.de>
Signed-off-by: Yinghai Lu <yinghai(a)kernel.org>
Signed-off-by: Brandon Philips <bphilips(a)suse.de>
Cc: stable(a)kernel.org

---
arch/x86/kernel/apic/io_apic.c | 17 +++----------
include/linux/irq.h | 2 +
kernel/irq/chip.c | 52 +++++++++++++++++++++++++++++++++--------
3 files changed, 50 insertions(+), 21 deletions(-)

Index: linux-2.6/arch/x86/kernel/apic/io_apic.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/apic/io_apic.c
+++ linux-2.6/arch/x86/kernel/apic/io_apic.c
@@ -3228,12 +3228,9 @@ unsigned int create_irq_nr(unsigned int
}
spin_unlock_irqrestore(&vector_lock, flags);

- if (irq > 0) {
- dynamic_irq_init(irq);
- /* restore it, in case dynamic_irq_init clear it */
- if (desc_new)
- desc_new->chip_data = cfg_new;
- }
+ if (irq > 0)
+ dynamic_irq_init_keep_chip_data(irq);
+
return irq;
}

@@ -3255,19 +3252,15 @@ int create_irq(void)
void destroy_irq(unsigned int irq)
{
unsigned long flags;
- struct irq_cfg *cfg;
struct irq_desc *desc;

/* store it, in case dynamic_irq_cleanup clear it */
desc = irq_to_desc(irq);
- cfg = desc->chip_data;
- dynamic_irq_cleanup(irq);
- /* connect back irq_cfg */
- desc->chip_data = cfg;
+ dynamic_irq_cleanup_keep_chip_data(irq);

free_irte(irq);
spin_lock_irqsave(&vector_lock, flags);
- __clear_irq_vector(irq, cfg);
+ __clear_irq_vector(irq, desc->chip_data);
spin_unlock_irqrestore(&vector_lock, flags);
}

Index: linux-2.6/include/linux/irq.h
===================================================================
--- linux-2.6.orig/include/linux/irq.h
+++ linux-2.6/include/linux/irq.h
@@ -400,7 +400,9 @@ static inline int irq_has_action(unsigne

/* Dynamic irq helper functions */
extern void dynamic_irq_init(unsigned int irq);
+void dynamic_irq_init_keep_chip_data(unsigned int irq);
extern void dynamic_irq_cleanup(unsigned int irq);
+void dynamic_irq_cleanup_keep_chip_data(unsigned int irq);

/* Set/get chip/data for an IRQ: */
extern int set_irq_chip(unsigned int irq, struct irq_chip *chip);
Index: linux-2.6/kernel/irq/chip.c
===================================================================
--- linux-2.6.orig/kernel/irq/chip.c
+++ linux-2.6/kernel/irq/chip.c
@@ -18,11 +18,7 @@

#include "internals.h"

-/**
- * dynamic_irq_init - initialize a dynamically allocated irq
- * @irq: irq number to initialize
- */
-void dynamic_irq_init(unsigned int irq)
+static void dynamic_irq_init_x(unsigned int irq, bool keep_chip_data)
{
struct irq_desc *desc;
unsigned long flags;
@@ -41,7 +37,8 @@ void dynamic_irq_init(unsigned int irq)
desc->depth = 1;
desc->msi_desc = NULL;
desc->handler_data = NULL;
- desc->chip_data = NULL;
+ if (!keep_chip_data)
+ desc->chip_data = NULL;
desc->action = NULL;
desc->irq_count = 0;
desc->irqs_unhandled = 0;
@@ -55,10 +52,26 @@ void dynamic_irq_init(unsigned int irq)
}

/**
- * dynamic_irq_cleanup - cleanup a dynamically allocated irq
+ * dynamic_irq_init - initialize a dynamically allocated irq
* @irq: irq number to initialize
*/
-void dynamic_irq_cleanup(unsigned int irq)
+void dynamic_irq_init(unsigned int irq)
+{
+ dynamic_irq_init_x(irq, false);
+}
+
+/**
+ * dynamic_irq_init_keep_chip_data - initialize a dynamically allocated irq
+ * @irq: irq number to initialize
+ *
+ * does not set irq_to_desc(irq)->chip_data to NULL
+ */
+void dynamic_irq_init_keep_chip_data(unsigned int irq)
+{
+ dynamic_irq_init_x(irq, true);
+}
+
+static void dynamic_irq_cleanup_x(unsigned int irq, bool keep_chip_data)
{
struct irq_desc *desc = irq_to_desc(irq);
unsigned long flags;
@@ -77,7 +90,8 @@ void dynamic_irq_cleanup(unsigned int ir
}
desc->msi_desc = NULL;
desc->handler_data = NULL;
- desc->chip_data = NULL;
+ if (!keep_chip_data)
+ desc->chip_data = NULL;
desc->handle_irq = handle_bad_irq;
desc->chip = &no_irq_chip;
desc->name = NULL;
@@ -85,6 +99,26 @@ void dynamic_irq_cleanup(unsigned int ir
raw_spin_unlock_irqrestore(&desc->lock, flags);
}

+/**
+ * dynamic_irq_cleanup - cleanup a dynamically allocated irq
+ * @irq: irq number to initialize
+ */
+void dynamic_irq_cleanup(unsigned int irq)
+{
+ dynamic_irq_cleanup_x(irq, false);
+}
+
+/**
+ * dynamic_irq_cleanup_keep_chip_data - cleanup a dynamically allocated irq
+ * @irq: irq number to initialize
+ *
+ * does not set irq_to_desc(irq)->chip_data to NULL
+ */
+void dynamic_irq_cleanup_keep_chip_data(unsigned int irq)
+{
+ dynamic_irq_cleanup_x(irq, true);
+}
+

/**
* set_irq_chip - set the irq chip for an irq
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo(a)vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
From: Yinghai Lu on
On 02/05/2010 10:42 PM, Brandon Philips wrote:
> Version 3: Forgot to refresh patch so destroy_irq() had uninitialized
> cfg as param to __clear_irq_vector(). Fixed now.

> @@ -3255,19 +3252,15 @@ int create_irq(void)
> void destroy_irq(unsigned int irq)
> {
> unsigned long flags;
> - struct irq_cfg *cfg;
> struct irq_desc *desc;
>
> /* store it, in case dynamic_irq_cleanup clear it */
> desc = irq_to_desc(irq);
> - cfg = desc->chip_data;
> - dynamic_irq_cleanup(irq);
> - /* connect back irq_cfg */
> - desc->chip_data = cfg;
> + dynamic_irq_cleanup_keep_chip_data(irq);
>
> free_irte(irq);
> spin_lock_irqsave(&vector_lock, flags);
> - __clear_irq_vector(irq, cfg);
> + __clear_irq_vector(irq, desc->chip_data);
> spin_unlock_irqrestore(&vector_lock, flags);
> }

==>
@@ -3308,17 +3305,12 @@ void destroy_irq(unsigned int irq)
{
unsigned long flags;
struct irq_cfg *cfg;
- struct irq_desc *desc;

- /* store it, in case dynamic_irq_cleanup clear it */
- desc = irq_to_desc(irq);
- cfg = desc->chip_data;
- dynamic_irq_cleanup(irq);
- /* connect back irq_cfg */
- desc->chip_data = cfg;
+ dynamic_irq_cleanup_keep_chip_data(irq);

free_irte(irq);
spin_lock_irqsave(&vector_lock, flags);
+ cfg = irq_to_desc(irq)->chip_data;
__clear_irq_vector(irq, cfg);
spin_unlock_irqrestore(&vector_lock, flags);
}


Yinghai
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo(a)vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/