From 77304d2abac6101f7249754ffdd4421258877ab0 Mon Sep 17 00:00:00 2001
From: Mike Snitzer
Date: Mon, 8 Nov 2010 14:39:12 +0100
Subject: block: read i_size with i_size_read()

Convert direct reads of an inode's i_size to using i_size_read().

i_size_{read,write} use a seqcount to protect reads from accessing
incomplete writes. Concurrent i_size_write()s require mutual exclusion
to protect the seqcount that is used by i_size_{read,write}. But
i_size_read() callers do not need to use additional locking.

Signed-off-by: Mike Snitzer
Acked-by: NeilBrown
Acked-by: Lars Ellenberg
Signed-off-by: Jens Axboe
---
 drivers/md/md.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'drivers/md/md.c')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 4e957f3140a8..324a3663fcda 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -706,7 +706,7 @@ static struct mdk_personality *find_pers(int level, char *clevel)
 /* return the offset of the super block in 512byte sectors */
 static inline sector_t calc_dev_sboffset(struct block_device *bdev)
 {
-	sector_t num_sectors = bdev->bd_inode->i_size / 512;
+	sector_t num_sectors = i_size_read(bdev->bd_inode) / 512;
 	return MD_NEW_SIZE_SECTORS(num_sectors);
 }
 
@@ -1386,7 +1386,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
 	 */
 	switch(minor_version) {
 	case 0:
-		sb_start = rdev->bdev->bd_inode->i_size >> 9;
+		sb_start = i_size_read(rdev->bdev->bd_inode) >> 9;
 		sb_start -= 8*2;
 		sb_start &= ~(sector_t)(4*2-1);
 		break;
@@ -1472,7 +1472,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
 		ret = 0;
 	}
 	if (minor_version)
-		rdev->sectors = (rdev->bdev->bd_inode->i_size >> 9) -
+		rdev->sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) -
 			le64_to_cpu(sb->data_offset);
 	else
 		rdev->sectors = rdev->sb_start;
@@ -1680,7 +1680,7 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
 		return 0; /* component must fit device */
 	if (rdev->sb_start < rdev->data_offset) {
 		/* minor versions 1 and 2; superblock before data */
-		max_sectors = rdev->bdev->bd_inode->i_size >> 9;
+		max_sectors = i_size_read(rdev->bdev->bd_inode) >> 9;
 		max_sectors -= rdev->data_offset;
 		if (!num_sectors || num_sectors > max_sectors)
 			num_sectors = max_sectors;
@@ -1690,7 +1690,7 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
 	} else {
 		/* minor version 0; superblock after data */
 		sector_t sb_start;
-		sb_start = (rdev->bdev->bd_inode->i_size >> 9) - 8*2;
+		sb_start = (i_size_read(rdev->bdev->bd_inode) >> 9) - 8*2;
 		sb_start &= ~(sector_t)(4*2 - 1);
 		max_sectors = rdev->sectors + sb_start - rdev->sb_start;
 		if (!num_sectors || num_sectors > max_sectors)
@@ -2584,7 +2584,7 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 			if (!sectors)
 				return -EBUSY;
 		} else if (!sectors)
-			sectors = (rdev->bdev->bd_inode->i_size >> 9) -
+			sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) -
 				rdev->data_offset;
 	}
 	if (sectors < my_mddev->dev_sectors)
@@ -2797,7 +2797,7 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi
 
 	kobject_init(&rdev->kobj, &rdev_ktype);
 
-	size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
+	size = i_size_read(rdev->bdev->bd_inode) >> BLOCK_SIZE_BITS;
 	if (!size) {
 		printk(KERN_WARNING
 			"md: %s has zero or unknown size, marking faulty!\n",
@@ -5235,8 +5235,8 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
 
 	if (!mddev->persistent) {
 		printk(KERN_INFO "md: nonpersistent superblock ...\n");
-		rdev->sb_start = rdev->bdev->bd_inode->i_size / 512;
-	} else 
+		rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512;
+	} else
 		rdev->sb_start = calc_dev_sboffset(rdev->bdev);
 	rdev->sectors = rdev->sb_start;
 
@@ -5306,7 +5306,7 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev)
 	if (mddev->persistent)
 		rdev->sb_start = calc_dev_sboffset(rdev->bdev);
 	else
-		rdev->sb_start = rdev->bdev->bd_inode->i_size / 512;
+		rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512;
 
 	rdev->sectors = rdev->sb_start;
--
cgit
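Note: on 32-bit SMP kernels a 64-bit i_size cannot be loaded atomically,
which is why the patch above routes all reads through i_size_read(). A
simplified sketch of the retry loop it relies on; the real helper in
include/linux/fs.h only takes this form under
BITS_PER_LONG==32 && CONFIG_SMP, and degenerates to a plain load on
64-bit builds:

#include <linux/fs.h>

/* Sketch: re-read i_size until no i_size_write() raced with us. */
static inline loff_t i_size_read_sketch(const struct inode *inode)
{
	loff_t i_size;
	unsigned int seq;

	do {
		seq = read_seqcount_begin(&inode->i_size_seqcount);
		i_size = inode->i_size;
	} while (read_seqcount_retry(&inode->i_size_seqcount, seq));
	return i_size;
}

This is why readers need no extra locking while concurrent writers must
serialize: only i_size_write() bumps the seqcount.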
From c26a44ed1e552aaa1d4ceb71842002d235fe98d7 Mon Sep 17 00:00:00 2001
From: Justin Maggard
Date: Wed, 24 Nov 2010 16:36:17 +1100
Subject: md: fix return value of rdev_size_change()

When trying to grow an array by enlarging component devices,
rdev_size_store() expects the return value of rdev_size_change()
to be in sectors, but the actual value is returned in KBs.

This functionality was broken by commit
dd8ac336c13fd8afdb082ebacb1cddd5cf727889, so this patch is suitable
for any kernel since 2.6.30.

Cc: stable@kernel.org
Signed-off-by: Justin Maggard
Signed-off-by: NeilBrown
---
 drivers/md/md.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'drivers/md/md.c')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 324a3663fcda..7b9e229a05e2 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1337,7 +1337,7 @@ super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
 	md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
 		       rdev->sb_page);
 	md_super_wait(rdev->mddev);
-	return num_sectors / 2; /* kB for sysfs */
+	return num_sectors;
 }
 
 
@@ -1704,7 +1704,7 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
 	md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
 		       rdev->sb_page);
 	md_super_wait(rdev->mddev);
-	return num_sectors / 2; /* kB for sysfs */
+	return num_sectors;
 }
 
 static struct super_type super_types[] = {
--
cgit
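Note: the unit mix-up above is easy to see with concrete numbers (a
hypothetical worked example; md measures component sizes in 512-byte
sectors, so 1 KB == 2 sectors):

	/* A 2 GiB component expressed in 512-byte sectors. */
	unsigned long long num_sectors = 4194304ULL;
	/* Old return value: num_sectors / 2 == 2097152, i.e. the size
	 * in KB, not sectors. The caller treats it as sectors, so the
	 * enlarged component would come out at half the intended size. */
	unsigned long long old_ret = num_sectors / 2;

Returning num_sectors unchanged keeps the caller and callee in the same
unit.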
From be20e6c67b6c6024c19094d3447f144d118733b0 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong"
Date: Wed, 24 Nov 2010 16:40:33 +1100
Subject: md: Call blk_queue_flush() to establish flush/fua support

Before 2.6.37, the md layer had a mechanism for catching I/Os with the
barrier flag set, and translating the barrier into barriers for all
the underlying devices. With 2.6.37, I/O barriers have become plain
old flushes, and the md code was updated to reflect this. However,
one piece was left out -- the md layer does not tell the block layer
that it supports flushes or FUA access at all, which results in md
silently dropping flush requests.

Since the support already seems there, just add this one piece of
bookkeeping.

Signed-off-by: Darrick J. Wong
Signed-off-by: NeilBrown
---
 drivers/md/md.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'drivers/md/md.c')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 7b9e229a05e2..84c46a161927 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -4338,6 +4338,8 @@ static int md_alloc(dev_t dev, char *name)
 	if (mddev->kobj.sd &&
 	    sysfs_create_group(&mddev->kobj, &md_bitmap_group))
 		printk(KERN_DEBUG "pointless warning\n");
+
+	blk_queue_flush(mddev->queue, REQ_FLUSH | REQ_FUA);
  abort:
 	mutex_unlock(&disks_mutex);
 	if (!error && mddev->kobj.sd) {
--
cgit

From 2b74e12e567feb4163e32815bce0be57489e73b9 Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Thu, 9 Dec 2010 15:59:01 +1100
Subject: md: remove handling of flush_pending in md_submit_flush_data

None of the functions called between setting flush_pending to 1 and
the atomic_dec_and_test can change flush_pending, nor will anything
running in any other thread (as ->flush_bio is not NULL). So the
atomic_dec_and_test will always succeed.

So remove the atomic_set and the atomic_dec_and_test.

Signed-off-by: NeilBrown
---
 drivers/md/md.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'drivers/md/md.c')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 84c46a161927..83b6cb3e7025 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -404,8 +404,6 @@ static void md_submit_flush_data(struct work_struct *ws)
 	mddev_t *mddev = container_of(ws, mddev_t, flush_work);
 	struct bio *bio = mddev->flush_bio;
 
-	atomic_set(&mddev->flush_pending, 1);
-
 	if (bio->bi_size == 0)
 		/* an empty barrier - all done */
 		bio_endio(bio, 0);
@@ -414,10 +412,9 @@ static void md_submit_flush_data(struct work_struct *ws)
 		if (mddev->pers->make_request(mddev, bio))
 			generic_make_request(bio);
 	}
-	if (atomic_dec_and_test(&mddev->flush_pending)) {
-		mddev->flush_bio = NULL;
-		wake_up(&mddev->sb_wait);
-	}
+
+	mddev->flush_bio = NULL;
+	wake_up(&mddev->sb_wait);
 }
 
 void md_flush_request(mddev_t *mddev, struct bio *bio)
--
cgit
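Note: the flush/FUA advertisement added two patches up uses the
2.6.37-era block API. A minimal sketch of the same call in a
hypothetical driver's queue setup (my_queue_init and q are
illustrative; only the blk_* calls and REQ_ flags come from the
kernel API of that period):

#include <linux/blkdev.h>

static struct request_queue *q;	/* hypothetical driver queue */

static int my_queue_init(void)
{
	q = blk_alloc_queue(GFP_KERNEL);
	if (!q)
		return -ENOMEM;
	/* Without this declaration the block layer strips
	 * REQ_FLUSH/REQ_FUA before requests reach the driver --
	 * the silent drop the patch above fixes for md. */
	blk_queue_flush(q, REQ_FLUSH | REQ_FUA);
	return 0;
}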
From a7a07e69653acf8540daa1da053cd84bf86e8e66 Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Thu, 9 Dec 2010 16:04:25 +1100
Subject: md: move code in to submit_flushes.

submit_flushes is called from exactly one place.
Move the code that is before and after that call into
submit_flushes.

This has no functional change, but will make the next patch smaller
and easier to follow.

Signed-off-by: NeilBrown
---
 drivers/md/md.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'drivers/md/md.c')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 83b6cb3e7025..31f8e151d893 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -371,10 +371,14 @@ static void md_end_flush(struct bio *bio, int err)
 	bio_put(bio);
 }
 
+static void md_submit_flush_data(struct work_struct *ws);
+
 static void submit_flushes(mddev_t *mddev)
 {
 	mdk_rdev_t *rdev;
 
+	INIT_WORK(&mddev->flush_work, md_submit_flush_data);
+	atomic_set(&mddev->flush_pending, 1);
 	rcu_read_lock();
 	list_for_each_entry_rcu(rdev, &mddev->disks, same_set)
 		if (rdev->raid_disk >= 0 &&
@@ -397,6 +401,8 @@ static void submit_flushes(mddev_t *mddev)
 			rdev_dec_pending(rdev, mddev);
 		}
 	rcu_read_unlock();
+	if (atomic_dec_and_test(&mddev->flush_pending))
+		queue_work(md_wq, &mddev->flush_work);
 }
 
 static void md_submit_flush_data(struct work_struct *ws)
@@ -426,13 +432,7 @@ void md_flush_request(mddev_t *mddev, struct bio *bio)
 	mddev->flush_bio = bio;
 	spin_unlock_irq(&mddev->write_lock);
 
-	atomic_set(&mddev->flush_pending, 1);
-	INIT_WORK(&mddev->flush_work, md_submit_flush_data);
-
 	submit_flushes(mddev);
-
-	if (atomic_dec_and_test(&mddev->flush_pending))
-		queue_work(md_wq, &mddev->flush_work);
 }
 
 EXPORT_SYMBOL(md_flush_request);
--
cgit

From a035fc3e2531703b539f23bec4ca7943cfc69349 Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Thu, 9 Dec 2010 16:17:51 +1100
Subject: md: fix possible deadlock in handling flush requests.

As recorded in
    https://bugzilla.kernel.org/show_bug.cgi?id=24012
it is possible for a flush request through md to hang. This is due to
an interaction between the recursion avoidance in generic_make_request,
the insistence in md of only having one flush active at a time, and the
possibility of dm (or md) submitting two flush requests to a device
from the one generic_make_request.

If a generic_make_request call into dm causes two flush requests to be
queued (as happens if the dm table has two targets - they get one
each), these two will be queued inside generic_make_request.

Assume they are for the same md device.
The first is processed and causes 1 or more flush requests to be sent
to lower devices. These get queued within generic_make_request too.
Then the second flush to the md device gets handled and it blocks
waiting for the first flush to complete. But it won't complete until
the two lower-device requests complete, and they haven't even been
submitted yet as they are on the generic_make_request queue.

The deadlock can be broken by using a separate thread to submit the
requests to lower devices. md has such a thread readily available:
md_wq.

So use it to submit these requests.

Reported-by: Giacomo Catenazzi
Tested-by: Giacomo Catenazzi
Signed-off-by: NeilBrown
---
 drivers/md/md.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'drivers/md/md.c')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 31f8e151d893..d66aaeddf95d 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -373,8 +373,9 @@ static void md_end_flush(struct bio *bio, int err)
 
 static void md_submit_flush_data(struct work_struct *ws);
 
-static void submit_flushes(mddev_t *mddev)
+static void submit_flushes(struct work_struct *ws)
 {
+	mddev_t *mddev = container_of(ws, mddev_t, flush_work);
 	mdk_rdev_t *rdev;
 
 	INIT_WORK(&mddev->flush_work, md_submit_flush_data);
@@ -432,7 +433,8 @@ void md_flush_request(mddev_t *mddev, struct bio *bio)
 	mddev->flush_bio = bio;
 	spin_unlock_irq(&mddev->write_lock);
 
-	submit_flushes(mddev);
+	INIT_WORK(&mddev->flush_work, submit_flushes);
+	queue_work(md_wq, &mddev->flush_work);
 }
 
 EXPORT_SYMBOL(md_flush_request);
--
cgit
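Note: the fix above is an instance of a general pattern -- break
generic_make_request() recursion by handing submission to a workqueue,
so the bio enters the block layer from a fresh stack instead of being
queued behind the bios that triggered it. A minimal, hypothetical
sketch of the pattern (struct my_dev, my_submit and my_defer are
illustrative, not md's actual types):

#include <linux/blkdev.h>
#include <linux/workqueue.h>

struct my_dev {
	struct bio *deferred_bio;
	struct work_struct work;
};

static void my_submit(struct work_struct *ws)
{
	struct my_dev *dev = container_of(ws, struct my_dev, work);

	/* Runs in worker-thread context, so this enters
	 * generic_make_request() at recursion depth zero. */
	generic_make_request(dev->deferred_bio);
}

static void my_defer(struct my_dev *dev, struct bio *bio)
{
	dev->deferred_bio = bio;
	INIT_WORK(&dev->work, my_submit);
	schedule_work(&dev->work);	/* md uses its own md_wq instead */
}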
From 1a855a0606653d2d82506281e2c686bacb4b2f45 Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Thu, 9 Dec 2010 16:36:28 +1100
Subject: md: fix bug with re-adding of partially recovered device.

With v0.90 metadata, a hot-spare does not become a full member of the
array until recovery is complete. So if we re-add such a device to
the array, we know that all of it is as up-to-date as the event count
would suggest, and so a bitmap-based recovery is possible.

However with v1.x metadata, the hot-spare immediately becomes a full
member of the array, but it records how much of the device has been
recovered. If the array is stopped and re-assembled, recovery starts
from this point.

When such a device is hot-added to an array we currently lose the
'how much is recovered' information and incorrectly include it as a
full in-sync member (after bitmap-based fixup).
This is wrong and unsafe and could corrupt data.

So be more careful about setting saved_raid_disk - which is what
guides the re-adding of devices back into an array.
The new code matches the code in slot_store which does a similar
thing, which is encouraging.

This is suitable for any -stable kernel.

Reported-by: "Dailey, Nate"
Cc: stable@kernel.org
Signed-off-by: NeilBrown
---
 drivers/md/md.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'drivers/md/md.c')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index d66aaeddf95d..b757da175180 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5159,7 +5159,7 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
 				PTR_ERR(rdev));
 			return PTR_ERR(rdev);
 		}
-		/* set save_raid_disk if appropriate */
+		/* set saved_raid_disk if appropriate */
 		if (!mddev->persistent) {
 			if (info->state & (1<<MD_DISK_SYNC) &&
 			    info->raid_disk < mddev->raid_disks)
@@ -5169,7 +5169,10 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
 		} else
 			super_types[mddev->major_version].
 				validate_super(mddev, rdev);
-		rdev->saved_raid_disk = rdev->raid_disk;
+		if (test_bit(In_sync, &rdev->flags))
+			rdev->saved_raid_disk = rdev->raid_disk;
+		else
+			rdev->saved_raid_disk = -1;
 		clear_bit(In_sync, &rdev->flags); /* just to be sure */
 		if (info->state & (1<<MD_DISK_WRITEMOSTLY))
 			set_bit(WriteMostly, &rdev->flags);
--
cgit
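Note: restating the decision in the hunk above, with the two outcomes
spelled out (a paraphrase of the patched code, not additional md
logic):

/*   device state on re-add       saved_raid_disk    recovery style
 *   ------------------------     ----------------   -------------------
 *   In_sync (fully recovered)    old slot number    bitmap-based, fast
 *   partially recovered          -1                 full resync, safe
 */
if (test_bit(In_sync, &rdev->flags))
	rdev->saved_raid_disk = rdev->raid_disk;
else
	rdev->saved_raid_disk = -1;

A valid saved_raid_disk is what licenses the cheap bitmap-based
catch-up; clearing it to -1 forces the conservative full recovery for
a device whose contents are only partially trustworthy.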
From: NeilBrown
Date: Thu, 9 Dec 2010 17:02:14 +1100
Subject: md: protect against NULL reference when waiting to start a raid10.

When we fail to start a raid10 for some reason, we call
md_unregister_thread to kill the thread that was created.

Unfortunately md_thread() will then make one call into the handler
(raid10d) even though md_wakeup_thread has not been called. This is
not safe and, as md_unregister_thread is called after mddev->private
has been set to NULL, it will definitely cause a NULL dereference.

So fix this at both ends:
 - md_thread should only call the handler if THREAD_WAKEUP has been
   set.
 - raid10 should call md_unregister_thread before setting things to
   NULL just like all the other raid modules do.

This is applicable to 2.6.35 and later.

Cc: stable@kernel.org
Reported-by: "Citizen"
Signed-off-by: NeilBrown
---
 drivers/md/md.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'drivers/md/md.c')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index b757da175180..e71c5fa527f5 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -6044,9 +6044,8 @@ static int md_thread(void * arg)
 			 || kthread_should_stop(),
 			 thread->timeout);
 
-		clear_bit(THREAD_WAKEUP, &thread->flags);
-
-		thread->run(thread->mddev);
+		if (test_and_clear_bit(THREAD_WAKEUP, &thread->flags))
+			thread->run(thread->mddev);
 	}
 
 	return 0;
--
cgit

From e692cb668fdd5a712c6ed2a2d6f2a36ee83997b4 Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen"
Date: Wed, 1 Dec 2010 19:41:49 +0100
Subject: block: Deprecate QUEUE_FLAG_CLUSTER and use queue_limits instead

When stacking devices, a request_queue is not always available. This
forced us to have a no_cluster flag in the queue_limits that could be
used as a carrier until the request_queue had been set up for a
metadevice.

There were several problems with that approach. First of all it was up
to the stacking device to remember to set the queue flag after stacking
had completed. Also, the queue flag and the queue limits had to be
kept in sync at all times. We got that wrong, which could lead to us
issuing commands that went beyond the max scatterlist limit set by the
driver.

The proper fix is to avoid having two flags for tracking the same
thing. We deprecate QUEUE_FLAG_CLUSTER and use the queue limit directly
in the block layer merging functions. The queue_limit 'no_cluster' is
turned into 'cluster' to avoid double negatives and to ease stacking.
Clustering defaults to being enabled as before. The queue flag logic is
removed from the stacking function, and explicitly setting the cluster
flag is no longer necessary in DM and MD.

Reported-by: Ed Lin
Signed-off-by: Martin K. Petersen
Acked-by: Mike Snitzer
Cc: stable@kernel.org
Signed-off-by: Jens Axboe
---
 drivers/md/md.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'drivers/md/md.c')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 84c46a161927..52694d29663d 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -4296,9 +4296,6 @@ static int md_alloc(dev_t dev, char *name)
 		goto abort;
 	mddev->queue->queuedata = mddev;
 
-	/* Can be unlocked because the queue is new: no concurrency */
-	queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, mddev->queue);
-
 	blk_queue_make_request(mddev->queue, md_make_request);
 
 	disk = alloc_disk(1 << shift);
--
cgit
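Note: the effect of the change above is that clustering has a single
source of truth, q->limits.cluster, which stacks through
blk_stack_limits() like any other queue_limit. A sketch of the
accessor shape this implies (a plausible form of the helper; treat the
exact name and placement as an assumption rather than a quote from the
tree):

#include <linux/blkdev.h>

/* Sketch: merging code asks the queue_limits directly, instead of
 * testing a QUEUE_FLAG_CLUSTER bit that stacking drivers had to
 * remember to mirror into place. */
static inline unsigned int blk_queue_cluster_sketch(struct request_queue *q)
{
	return q->limits.cluster;
}

With the limit carried in queue_limits from the start, a metadevice
like md no longer needs the queue_flag_set_unlocked() call that the
hunk above deletes.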