From: Dan Williams on
Hi,

We (Intel software raid devs) are seeing a problem with the detection of
partitions on md devices. The trace below shows that the block device
inode is dropped in-between when the array comes up and when it is first
opened (timestamp 655.498875). When this happens bdev->bd_invalidated
is cleared by bdget() and we never detect partitions. (Seen on a 2.6.32
based kernel, but I presume it is still present)

# tracer: nop
#
# TASK-PID CPU# TIMESTAMP FUNCTION
# | | | | |
<...>-1114 [000] 655.488730: check_disk_size_change: ffff8800374d3780: (md1) check_disk_size_change:1103 open: 1 invalidated: 0
<...>-1114 [000] 655.497906: check_disk_size_change: ffff8800374d3780: (md1) check_disk_size_change:1112 open: 1 invalidated: 0
<...>-1114 [000] 655.497908: flush_disk: ffff8800374d3780: (md1) flush_disk:1084 open: 1 invalidated: 1
<...>-1114 [000] 655.497909: flush_disk: ffff8800374d3780: (md1) flush_disk:1086 open: 1 invalidated: 1
<...>-1117 [003] 655.498875: bdget: ffff8800375aec40: () bdget:595 open: 0 invalidated: 0
<...>-1117 [003] 655.498878: __blkdev_get: ffff8800375aec40: (md1) __blkdev_get:1229 open: 0 invalidated: 0
<...>-1117 [003] 655.498879: __blkdev_get: ffff8800375aec40: (md1) __blkdev_get:1233 open: 0 invalidated: 0
<...>-1117 [003] 655.498880: __blkdev_get: ffff8800375aec40: (md1) __blkdev_get:1239 open: 0 invalidated: 0
<...>-1117 [003] 655.498882: __blkdev_get: ffff8800375aec40: (md1) __blkdev_get:1307 open: 1 invalidated: 0

These traces generated by:

#define dbg(bdev) ({ if (debug_partitions) {\
char name[BDEVNAME_SIZE] = "";\
if (bdev->bd_disk)\
disk_name(bdev->bd_disk, 0, name);\
trace_printk("%p: (%s) %s:%d open: %d invalidated: %d\n", bdev, name, __func__, __LINE__, bdev->bd_openers, bdev->bd_invalidated); \
} 0; })

The patch below (2.6.32 based) moves the block_device bd_invalidated
field to a gendisk flag, as it seems this info wants a longer lifetime.

Thoughts on this fix? Maybe it wants to be a standalone integer flag so
we don't need to add locking to nbd.

Thanks,
Dan

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index cc923a5..de8a4a4 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -607,8 +607,10 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *lo,
if (S_ISSOCK(inode->i_mode)) {
lo->file = file;
lo->sock = SOCKET_I(inode);
+ mutex_lock(&bdev->bd_mutex);
if (max_part > 0)
- bdev->bd_invalidated = 1;
+ bdev->bd_disk->flags |= GENHD_FL_INVALIDATED;
+ mutex_unlock(&bdev->bd_mutex);
return 0;
} else {
fput(file);
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 6dcee88..147f449 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -571,7 +571,6 @@ struct block_device *bdget(dev_t dev)
bdev->bd_inode = inode;
bdev->bd_block_size = (1 << inode->i_blkbits);
bdev->bd_part_count = 0;
- bdev->bd_invalidated = 0;
inode->i_mode = S_IFBLK;
inode->i_rdev = dev;
inode->i_bdev = bdev;
@@ -1069,7 +1068,7 @@ static void flush_disk(struct block_device *bdev)
if (!bdev->bd_disk)
return;
if (disk_partitionable(bdev->bd_disk))
- bdev->bd_invalidated = 1;
+ bdev->bd_disk->flags |= GENHD_FL_INVALIDATED;
}

/**
@@ -1243,7 +1242,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
bdi = &default_backing_dev_info;
bdev->bd_inode->i_data.backing_dev_info = bdi;
}
- if (bdev->bd_invalidated)
+ if (bdev->bd_disk->flags & GENHD_FL_INVALIDATED)
rescan_partitions(disk, bdev);
} else {
struct block_device *whole;
@@ -1276,7 +1275,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
if (ret)
goto out_unlock_bdev;
}
- if (bdev->bd_invalidated)
+ if (bdev->bd_disk->flags & GENHD_FL_INVALIDATED)
rescan_partitions(bdev->bd_disk, bdev);
}
}
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index e8865c1..7872269 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -519,7 +519,7 @@ void register_disk(struct gendisk *disk)
if (!bdev)
goto exit;

- bdev->bd_invalidated = 1;
+ disk->flags |= GENHD_FL_INVALIDATED;
err = blkdev_get(bdev, FMODE_READ);
if (err < 0)
goto exit;
@@ -558,7 +558,7 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
if (disk->fops->revalidate_disk)
disk->fops->revalidate_disk(disk);
check_disk_size_change(disk, bdev);
- bdev->bd_invalidated = 0;
+ bdev->bd_disk->flags &= ~GENHD_FL_INVALIDATED;
if (!get_capacity(disk) || !(state = check_partition(disk, bdev)))
return 0;
if (IS_ERR(state)) /* I/O error reading the partition table */
@@ -609,7 +609,7 @@ try_scan:
if (capacity > get_capacity(disk)) {
set_capacity(disk, capacity);
check_disk_size_change(disk, bdev);
- bdev->bd_invalidated = 0;
+ bdev->bd_disk->flags &= ~GENHD_FL_INVALIDATED;
}
goto try_scan;
} else {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 1e58fc8..367875a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -661,7 +661,6 @@ struct block_device {
struct hd_struct * bd_part;
/* number of times partitions within this device have been opened. */
unsigned bd_part_count;
- int bd_invalidated;
struct gendisk * bd_disk;
struct list_head bd_list;
/*
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index c6c0c41..d97cdec 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -116,6 +116,7 @@ struct hd_struct {
#define GENHD_FL_SUPPRESS_PARTITION_INFO 32
#define GENHD_FL_EXT_DEVT 64 /* allow extended devt */
#define GENHD_FL_NATIVE_CAPACITY 128
+#define GENHD_FL_INVALIDATED 256

#define BLK_SCSI_MAX_CMDS (256)
#define BLK_SCSI_CMD_PER_LONG (BLK_SCSI_MAX_CMDS / (sizeof(long) * 8))


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo(a)vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
From: Dan Williams on
On Wed, Jul 21, 2010 at 12:52 PM, Dan Williams <dan.j.williams(a)intel.com> wrote:
> Hi,
>
> We (Intel software raid devs) are seeing a problem with the detection of
> partitions on md devices. �The trace below shows that the block device
> inode is dropped in-between when the array comes up and when it is first
> opened (timestamp 655.498875). �When this happens bdev->bd_invalidated
> is cleared by bdget() and we never detect partitions. �(Seen on a 2.6.32
> based kernel, but I presume it is still present)

The kernel in question was missing commit f3b99be1 "Restore partition
detection of newly created md arrays" [1]. With that in place the
problem goes away. I haven't fully convinced myself that there isn't
a small race that could be triggered by changing the array size via
sysfs, to get the the gendisk size updated and then dropping all inode
references prior to the first open... but I'll put that investigation
on the back burner.

--
Dan

[1]: http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commitdiff;h=f3b99be1
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo(a)vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/