From c6eda5e81c4fcc77185117255c7419eda771f67f Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 31 Jan 2014 14:11:54 -0500 Subject: dm cache: move hook_info into common portion of per_bio_data structure Commit c9d28d5d ("dm cache: promotion optimisation for writes") incorrectly placed the 'hook_info' member in the writethrough-only portion of the per_bio_data structure. Given that the overwrite optimization may be used for writeback the 'hook_info' member must be placed above the 'cache' member of the per_bio_data structure. Any members above 'cache' are available from both writeback and writethrough modes' per_bio_data structure. Signed-off-by: Mike Snitzer Acked-by: Joe Thornber Cc: stable@vger.kernel.org # 3.13+ --- drivers/md/dm-cache-target.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index ffd472e015ca..14256b7fce79 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -289,6 +289,7 @@ struct per_bio_data { bool tick:1; unsigned req_nr:2; struct dm_deferred_entry *all_io_entry; + struct dm_hook_info hook_info; /* * writethrough fields. These MUST remain at the end of this @@ -297,7 +298,6 @@ struct per_bio_data { */ struct cache *cache; dm_cblock_t cblock; - struct dm_hook_info hook_info; struct dm_bio_details bio_details; }; -- cgit v1.2.3 From 80ae49aaed32af72630251eb06161b15cde15ac8 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 31 Jan 2014 14:30:37 -0500 Subject: dm cache: do not add migration to completed list before unhooking bio When completing an overwrite bio, in overwrite_endio(), the associated migration should not be added to the 'completed_migrations' until the bio's fields are restored with dm_unhook_bio(). Otherwise, do_worker() can race to process 'completed_migrations' before dm_unhook_bio() -- so the bio's bi_end_io is incorrect. This is unlikely to cause any problems given the current code but should be fixed on the basis of correctness. Also, the cache's spinlock only needs to be held when manipulating the 'completed_migrations' list -- other changes don't need protection. Signed-off-by: Mike Snitzer Acked-by: Joe Thornber --- drivers/md/dm-cache-target.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 14256b7fce79..db0944465127 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -1010,13 +1010,15 @@ static void overwrite_endio(struct bio *bio, int err) struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); unsigned long flags; + dm_unhook_bio(&pb->hook_info, bio); + if (err) mg->err = true; + mg->requeue_holder = false; + spin_lock_irqsave(&cache->lock, flags); list_add_tail(&mg->list, &cache->completed_migrations); - dm_unhook_bio(&pb->hook_info, bio); - mg->requeue_holder = false; spin_unlock_irqrestore(&cache->lock, flags); wake_worker(cache); -- cgit v1.2.3 From 4d1662a30dde6e545086fe0e8fd7e474c4e0b639 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 6 Feb 2014 06:08:56 -0500 Subject: dm thin: avoid metadata commit if a pool's thin devices haven't changed Commit 905e51b ("dm thin: commit outstanding data every second") introduced a periodic commit. This commit occurs regardless of whether any thin devices have made changes. Fix the periodic commit to check if any of a pool's thin devices have changed using dm_pool_changed_this_transaction(). Reported-by: Alexander Larsson Signed-off-by: Mike Snitzer Acked-by: Joe Thornber Cc: stable@vger.kernel.org --- drivers/md/dm-thin-metadata.c | 17 +++++++++++++++++ drivers/md/dm-thin-metadata.h | 2 ++ drivers/md/dm-thin.c | 3 ++- 3 files changed, 21 insertions(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index 7da347665552..3bb4506582a9 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -1489,6 +1489,23 @@ bool dm_thin_changed_this_transaction(struct dm_thin_device *td) return r; } +bool dm_pool_changed_this_transaction(struct dm_pool_metadata *pmd) +{ + bool r = false; + struct dm_thin_device *td, *tmp; + + down_read(&pmd->root_lock); + list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) { + if (td->changed) { + r = td->changed; + break; + } + } + up_read(&pmd->root_lock); + + return r; +} + bool dm_thin_aborted_changes(struct dm_thin_device *td) { bool r; diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h index 9a368567632f..dd6088a17a65 100644 --- a/drivers/md/dm-thin-metadata.h +++ b/drivers/md/dm-thin-metadata.h @@ -161,6 +161,8 @@ int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block); */ bool dm_thin_changed_this_transaction(struct dm_thin_device *td); +bool dm_pool_changed_this_transaction(struct dm_pool_metadata *pmd); + bool dm_thin_aborted_changes(struct dm_thin_device *td); int dm_thin_get_highest_mapped_block(struct dm_thin_device *td, diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index faaf944597ab..d501e436a7d9 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -1357,7 +1357,8 @@ static void process_deferred_bios(struct pool *pool) bio_list_init(&pool->deferred_flush_bios); spin_unlock_irqrestore(&pool->lock, flags); - if (bio_list_empty(&bios) && !need_commit_due_to_time(pool)) + if (bio_list_empty(&bios) && + !(dm_pool_changed_this_transaction(pool->pmd) && need_commit_due_to_time(pool))) return; if (commit(pool)) { -- cgit v1.2.3 From d73f9907294c670da14baea3fb31be27879e818e Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Wed, 12 Feb 2014 16:37:30 -0500 Subject: dm io: fix I/O to multiple destinations Commit 003b5c5719f159f4f4bf97511c4702a0638313dd ("block: Convert drivers to immutable biovecs") broke dm-mirror due to dm-io breakage. dm-io had three possible iterators (DM_IO_PAGE_LIST, DM_IO_BVEC, DM_IO_VMA) that iterate over pages where the I/O should be performed. The switch to immutable biovecs changed the DM_IO_BVEC iterator to DM_IO_BIO. Before this change the iterator stored the pointer to a bio vector in the dpages structure. The iterator incremented the pointer in the dpages structure as it advanced over the pages. After the immutable biovecs change, the DM_IO_BIO iterator stores a pointer to the bio in the dpages structure and uses bio_advance to change the bio as it advances. The problem is that the function dispatch_io stores the content of the dpages structure into the variable old_pages and restores it before issuing I/O to each of the devices. Before the change, the statement "*dp = old_pages;" restored the iterator to its starting position. After the change, struct dpages holds a pointer to the bio, thus the statement "*dp = old_pages;" doesn't restore the iterator. Consequently, in the context of dm-mirror: only the first mirror leg is written correctly, the kernel locks up when trying to write the other mirror legs because the number of sectors to write in the where->count variable doesn't match the number of sectors returned by the iterator. This patch fixes the bug by partially reverting the original patch - it changes the code so that struct dpages holds a pointer to the bio vector, so that the statement "*dp = old_pages;" restores the iterator correctly. The field "context_u" holds the offset from the beginning of the current bio vector entry, just like the "bio->bi_iter.bi_bvec_done" field. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-io.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index b2b8a10e8427..3842ac738f98 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -201,29 +201,28 @@ static void list_dp_init(struct dpages *dp, struct page_list *pl, unsigned offse /* * Functions for getting the pages from a bvec. */ -static void bio_get_page(struct dpages *dp, - struct page **p, unsigned long *len, unsigned *offset) +static void bio_get_page(struct dpages *dp, struct page **p, + unsigned long *len, unsigned *offset) { - struct bio *bio = dp->context_ptr; - struct bio_vec bvec = bio_iovec(bio); - *p = bvec.bv_page; - *len = bvec.bv_len; - *offset = bvec.bv_offset; + struct bio_vec *bvec = dp->context_ptr; + *p = bvec->bv_page; + *len = bvec->bv_len - dp->context_u; + *offset = bvec->bv_offset + dp->context_u; } static void bio_next_page(struct dpages *dp) { - struct bio *bio = dp->context_ptr; - struct bio_vec bvec = bio_iovec(bio); - - bio_advance(bio, bvec.bv_len); + struct bio_vec *bvec = dp->context_ptr; + dp->context_ptr = bvec + 1; + dp->context_u = 0; } static void bio_dp_init(struct dpages *dp, struct bio *bio) { dp->get_page = bio_get_page; dp->next_page = bio_next_page; - dp->context_ptr = bio; + dp->context_ptr = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter); + dp->context_u = bio->bi_iter.bi_bvec_done; } /* -- cgit v1.2.3 From f3a44fe0608eb628e1f8f6e3540462e6d171a745 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Tue, 18 Feb 2014 09:57:22 -0500 Subject: dm raid1: fix immutable biovec related BUG when retrying read bio When restoring bi_end_io, increase bi_remaining before retrying the bio to avoid BUG_ON(atomic_read(&bio->bi_remaining) <= 0) in bio_endio(). Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-raid1.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'drivers/md') diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index f284e0bfb25f..7dfdb5c746d6 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -1244,6 +1244,9 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, int error) dm_bio_restore(bd, bio); bio_record->details.bi_bdev = NULL; + + atomic_inc(&bio->bi_remaining); + queue_bio(ms, bio, rw); return DM_ENDIO_INCOMPLETE; } -- cgit v1.2.3 From 1acacc0784aab45627b6009e0e9224886279ac0b Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Wed, 19 Feb 2014 20:32:33 -0500 Subject: dm thin: fix the error path for the thin device constructor dm_pool_close_thin_device() must be called if dm_set_target_max_io_len() fails in thin_ctr(). Otherwise __pool_destroy() will fail because the pool will still have an open thin device: device-mapper: thin metadata: attempt to close pmd when 1 device(s) are still open device-mapper: thin: __pool_destroy: dm_pool_metadata_close() failed. Also, must establish error code if failing thin_ctr() because the pool is in fail_io mode. Signed-off-by: Mike Snitzer Acked-by: Joe Thornber Cc: stable@vger.kernel.org --- drivers/md/dm-thin.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index d501e436a7d9..4cf4b198cb60 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -2895,6 +2895,7 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) if (get_pool_mode(tc->pool) == PM_FAIL) { ti->error = "Couldn't open thin device, Pool is in fail mode"; + r = -EINVAL; goto bad_thin_open; } @@ -2906,7 +2907,7 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) r = dm_set_target_max_io_len(ti, tc->pool->sectors_per_block); if (r) - goto bad_thin_open; + goto bad_target_max_io_len; ti->num_flush_bios = 1; ti->flush_supported = true; @@ -2927,6 +2928,8 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) return 0; +bad_target_max_io_len: + dm_pool_close_thin_device(tc->td); bad_thin_open: __pool_dec(tc->pool); bad_pool_lookup: -- cgit v1.2.3 From a1989b330093578ea5470bea0a00f940c444c466 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Wed, 26 Feb 2014 10:07:04 +0100 Subject: dm mpath: fix stalls when handling invalid ioctls An invalid ioctl will never be valid, irrespective of whether multipath has active paths or not. So for invalid ioctls we do not have to wait for multipath to activate any paths, but can rather return an error code immediately. This fix resolves numerous instances of: udevd[]: worker [] unexpectedly returned with status 0x0100 that have been seen during testing. Signed-off-by: Hannes Reinecke Signed-off-by: Mike Snitzer Cc: stable@vger.kernel.org --- drivers/md/dm-mpath.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 6eb9dc9ef8f3..422a9fdeb53e 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -1626,8 +1626,11 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd, /* * Only pass ioctls through if the device sizes match exactly. */ - if (!r && ti->len != i_size_read(bdev->bd_inode) >> SECTOR_SHIFT) - r = scsi_verify_blk_ioctl(NULL, cmd); + if (!bdev || ti->len != i_size_read(bdev->bd_inode) >> SECTOR_SHIFT) { + int err = scsi_verify_blk_ioctl(NULL, cmd); + if (err) + r = err; + } if (r == -ENOTCONN && !fatal_signal_pending(current)) queue_work(kmultipathd, &m->process_queued_ios); -- cgit v1.2.3 From 7d48935eff401bb7970e73e822871a10e3643df1 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Wed, 12 Feb 2014 23:58:15 -0500 Subject: dm thin: allow metadata space larger than supported to go unused It was always intended that a user could provide a thin metadata device that is larger than the max supported by the on-disk format. The extra space would just go unused. Unfortunately that never worked. If the user attempted to use a larger metadata device on creation they would get an error like the following: device-mapper: space map common: space map too large device-mapper: transaction manager: couldn't create metadata space map device-mapper: thin metadata: tm_create_with_sm failed device-mapper: table: 252:17: thin-pool: Error creating metadata object device-mapper: ioctl: error adding target to table Fix this by allowing the initial metadata space map creation to cap its size at the max number of blocks supported (DM_SM_METADATA_MAX_BLOCKS). get_metadata_dev_size() must also impose DM_SM_METADATA_MAX_BLOCKS (via THIN_METADATA_MAX_SECTORS), otherwise extending metadata would cap at THIN_METADATA_MAX_SECTORS_WARNING (which is larger than supported). Also, the calculation for THIN_METADATA_MAX_SECTORS didn't account for the sizeof the disk_bitmap_header. So the supported maximum metadata size is a bit smaller (reduced from 33423360 to 33292800 sectors). Lastly, remove the "excess space will not be used" warning message from get_metadata_dev_size(); it resulted in printing the warning multiple times. Factor out warn_if_metadata_device_too_big(), call it from pool_ctr() and maybe_resize_metadata_dev(). Signed-off-by: Mike Snitzer Acked-by: Joe Thornber --- drivers/md/dm-thin-metadata.c | 4 +-- drivers/md/dm-thin-metadata.h | 8 +++--- drivers/md/dm-thin.c | 31 +++++++++++++--------- drivers/md/persistent-data/dm-space-map-metadata.c | 2 ++ drivers/md/persistent-data/dm-space-map-metadata.h | 11 ++++++++ 5 files changed, 37 insertions(+), 19 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index 3bb4506582a9..baa87ff12816 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -483,7 +483,7 @@ static int __write_initial_superblock(struct dm_pool_metadata *pmd) disk_super->data_mapping_root = cpu_to_le64(pmd->root); disk_super->device_details_root = cpu_to_le64(pmd->details_root); - disk_super->metadata_block_size = cpu_to_le32(THIN_METADATA_BLOCK_SIZE >> SECTOR_SHIFT); + disk_super->metadata_block_size = cpu_to_le32(THIN_METADATA_BLOCK_SIZE); disk_super->metadata_nr_blocks = cpu_to_le64(bdev_size >> SECTOR_TO_BLOCK_SHIFT); disk_super->data_block_size = cpu_to_le32(pmd->data_block_size); @@ -651,7 +651,7 @@ static int __create_persistent_data_objects(struct dm_pool_metadata *pmd, bool f { int r; - pmd->bm = dm_block_manager_create(pmd->bdev, THIN_METADATA_BLOCK_SIZE, + pmd->bm = dm_block_manager_create(pmd->bdev, THIN_METADATA_BLOCK_SIZE << SECTOR_SHIFT, THIN_METADATA_CACHE_SIZE, THIN_MAX_CONCURRENT_LOCKS); if (IS_ERR(pmd->bm)) { diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h index dd6088a17a65..82ea384d36ff 100644 --- a/drivers/md/dm-thin-metadata.h +++ b/drivers/md/dm-thin-metadata.h @@ -9,16 +9,14 @@ #include "persistent-data/dm-block-manager.h" #include "persistent-data/dm-space-map.h" +#include "persistent-data/dm-space-map-metadata.h" -#define THIN_METADATA_BLOCK_SIZE 4096 +#define THIN_METADATA_BLOCK_SIZE DM_SM_METADATA_BLOCK_SIZE /* * The metadata device is currently limited in size. - * - * We have one block of index, which can hold 255 index entries. Each - * index entry contains allocation info about 16k metadata blocks. */ -#define THIN_METADATA_MAX_SECTORS (255 * (1 << 14) * (THIN_METADATA_BLOCK_SIZE / (1 << SECTOR_SHIFT))) +#define THIN_METADATA_MAX_SECTORS DM_SM_METADATA_MAX_SECTORS /* * A metadata device larger than 16GB triggers a warning. diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 4cf4b198cb60..7e84baccf0ad 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -2000,16 +2000,27 @@ static void metadata_low_callback(void *context) dm_table_event(pool->ti->table); } -static sector_t get_metadata_dev_size(struct block_device *bdev) +static sector_t get_dev_size(struct block_device *bdev) +{ + return i_size_read(bdev->bd_inode) >> SECTOR_SHIFT; +} + +static void warn_if_metadata_device_too_big(struct block_device *bdev) { - sector_t metadata_dev_size = i_size_read(bdev->bd_inode) >> SECTOR_SHIFT; + sector_t metadata_dev_size = get_dev_size(bdev); char buffer[BDEVNAME_SIZE]; - if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING) { + if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING) DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.", bdevname(bdev, buffer), THIN_METADATA_MAX_SECTORS); - metadata_dev_size = THIN_METADATA_MAX_SECTORS_WARNING; - } +} + +static sector_t get_metadata_dev_size(struct block_device *bdev) +{ + sector_t metadata_dev_size = get_dev_size(bdev); + + if (metadata_dev_size > THIN_METADATA_MAX_SECTORS) + metadata_dev_size = THIN_METADATA_MAX_SECTORS; return metadata_dev_size; } @@ -2018,7 +2029,7 @@ static dm_block_t get_metadata_dev_size_in_blocks(struct block_device *bdev) { sector_t metadata_dev_size = get_metadata_dev_size(bdev); - sector_div(metadata_dev_size, THIN_METADATA_BLOCK_SIZE >> SECTOR_SHIFT); + sector_div(metadata_dev_size, THIN_METADATA_BLOCK_SIZE); return metadata_dev_size; } @@ -2096,12 +2107,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) ti->error = "Error opening metadata block device"; goto out_unlock; } - - /* - * Run for the side-effect of possibly issuing a warning if the - * device is too big. - */ - (void) get_metadata_dev_size(metadata_dev->bdev); + warn_if_metadata_device_too_big(metadata_dev->bdev); r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev); if (r) { @@ -2288,6 +2294,7 @@ static int maybe_resize_metadata_dev(struct dm_target *ti, bool *need_commit) return -EINVAL; } else if (metadata_dev_size > sb_metadata_dev_size) { + warn_if_metadata_device_too_big(pool->md_dev); DMINFO("%s: growing the metadata device from %llu to %llu blocks", dm_device_name(pool->pool_md), sb_metadata_dev_size, metadata_dev_size); diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c index 536782e3bcb7..e9bdd462f4f5 100644 --- a/drivers/md/persistent-data/dm-space-map-metadata.c +++ b/drivers/md/persistent-data/dm-space-map-metadata.c @@ -680,6 +680,8 @@ int dm_sm_metadata_create(struct dm_space_map *sm, if (r) return r; + if (nr_blocks > DM_SM_METADATA_MAX_BLOCKS) + nr_blocks = DM_SM_METADATA_MAX_BLOCKS; r = sm_ll_extend(&smm->ll, nr_blocks); if (r) return r; diff --git a/drivers/md/persistent-data/dm-space-map-metadata.h b/drivers/md/persistent-data/dm-space-map-metadata.h index 39bba0801cf2..64df923974d8 100644 --- a/drivers/md/persistent-data/dm-space-map-metadata.h +++ b/drivers/md/persistent-data/dm-space-map-metadata.h @@ -9,6 +9,17 @@ #include "dm-transaction-manager.h" +#define DM_SM_METADATA_BLOCK_SIZE (4096 >> SECTOR_SHIFT) + +/* + * The metadata device is currently limited in size. + * + * We have one block of index, which can hold 255 index entries. Each + * index entry contains allocation info about ~16k metadata blocks. + */ +#define DM_SM_METADATA_MAX_BLOCKS (255 * ((1 << 14) - 64)) +#define DM_SM_METADATA_MAX_SECTORS (DM_SM_METADATA_MAX_BLOCKS * DM_SM_METADATA_BLOCK_SIZE) + /* * Unfortunately we have to use two-phase construction due to the cycle * between the tm and sm. -- cgit v1.2.3 From e0d849fad746cb36a6822e4595d8ba9bf0adf7fa Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Thu, 27 Feb 2014 22:46:48 +0100 Subject: dm cache: fix truncation bug when mapping I/O to >2TB fast device When remapping a block to the cache's fast device that is larger than 2TB we must not truncate the destination sector to 32bits. The 32bit temporary result of from_cblock() was being overflowed in remap_to_cache() due to the logical left shift. Use an intermediate 64bit type to store the 32bit from_cblock() result to fix the overflow. Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer Cc: stable@vger.kernel.org --- drivers/md/dm-cache-target.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index db0944465127..1af70145fab9 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -671,15 +671,16 @@ static void remap_to_cache(struct cache *cache, struct bio *bio, dm_cblock_t cblock) { sector_t bi_sector = bio->bi_iter.bi_sector; + sector_t block = from_cblock(cblock); bio->bi_bdev = cache->cache_dev->bdev; if (!block_size_is_power_of_two(cache)) bio->bi_iter.bi_sector = - (from_cblock(cblock) * cache->sectors_per_block) + + (block * cache->sectors_per_block) + sector_div(bi_sector, cache->sectors_per_block); else bio->bi_iter.bi_sector = - (from_cblock(cblock) << cache->sectors_per_block_shift) | + (block << cache->sectors_per_block_shift) | (bi_sector & (cache->sectors_per_block - 1)); } -- cgit v1.2.3 From 14f398ca2f26a2ed6236aec54395e0fa06ec8a82 Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Fri, 28 Feb 2014 12:02:56 -0500 Subject: dm cache mq: fix memory allocation failure for large cache devices The memory allocated for the multiqueue policy's hash table doesn't need to be physically contiguous. Use vzalloc() instead of kzalloc(). Fedora has been carrying this fix since 10/10/2013. Failure seen during creation of a 10TB cached device with a 2048 sector block size and 411GB cache size: dmsetup: page allocation failure: order:9, mode:0x10c0d0 CPU: 11 PID: 29235 Comm: dmsetup Not tainted 3.10.4 #3 Hardware name: Supermicro X8DTL/X8DTL, BIOS 2.1a 12/30/2011 000000000010c0d0 ffff880090941898 ffffffff81387ab4 ffff880090941928 ffffffff810bb26f 0000000000000009 000000000010c0d0 ffff880090941928 ffffffff81385dbc ffffffff815f3840 ffffffff00000000 000002000010c0d0 Call Trace: [] dump_stack+0x19/0x1b [] warn_alloc_failed+0x110/0x124 [] ? __alloc_pages_direct_compact+0x17c/0x18e [] __alloc_pages_nodemask+0x6c7/0x75e [] __get_free_pages+0x12/0x3f [] kmalloc_order_trace+0x29/0x88 [] __kmalloc+0x36/0x11b [] ? mq_create+0x1dc/0x2cf [dm_cache_mq] [] mq_create+0x2af/0x2cf [dm_cache_mq] [] dm_cache_policy_create+0xa7/0xd2 [dm_cache] [] ? cache_ctr+0x245/0xa13 [dm_cache] [] cache_ctr+0x353/0xa13 [dm_cache] [] dm_table_add_target+0x227/0x2ce [dm_mod] [] table_load+0x286/0x2ac [dm_mod] [] ? dev_wait+0x8a/0x8a [dm_mod] [] ctl_ioctl+0x39a/0x3c2 [dm_mod] [] dm_ctl_ioctl+0xe/0x12 [dm_mod] [] vfs_ioctl+0x21/0x34 [] do_vfs_ioctl+0x3b1/0x3f4 [] ? ____fput+0x9/0xb [] ? task_work_run+0x7e/0x92 [] SyS_ioctl+0x52/0x82 [] system_call_fastpath+0x16/0x1b Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer Cc: stable@vger.kernel.org --- drivers/md/dm-cache-policy-mq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c index 1e018e986610..0e385e40909e 100644 --- a/drivers/md/dm-cache-policy-mq.c +++ b/drivers/md/dm-cache-policy-mq.c @@ -872,7 +872,7 @@ static void mq_destroy(struct dm_cache_policy *p) { struct mq_policy *mq = to_mq_policy(p); - kfree(mq->table); + vfree(mq->table); epool_exit(&mq->cache_pool); epool_exit(&mq->pre_cache_pool); kfree(mq); @@ -1245,7 +1245,7 @@ static struct dm_cache_policy *mq_create(dm_cblock_t cache_size, mq->nr_buckets = next_power(from_cblock(cache_size) / 2, 16); mq->hash_bits = ffs(mq->nr_buckets) - 1; - mq->table = kzalloc(sizeof(*mq->table) * mq->nr_buckets, GFP_KERNEL); + mq->table = vzalloc(sizeof(*mq->table) * mq->nr_buckets); if (!mq->table) goto bad_alloc_table; -- cgit v1.2.3 From c64d240df31513522901539a3206a41e9e6d00c8 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Mon, 3 Mar 2014 10:57:56 -0500 Subject: dm: fix Kconfig indentation Since DM_DEBUG_BLOCK_STACK_TRACING is a DM_PERSISTENT_DATA config option move it from drivers/md/Kconfig to drivers/md/persistent-data/Kconfig. Doing so fixes indentation for other DM config options. Signed-off-by: Mike Snitzer --- drivers/md/Kconfig | 10 ---------- drivers/md/persistent-data/Kconfig | 10 ++++++++++ 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 9a06fe883766..95ad936e6048 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -254,16 +254,6 @@ config DM_THIN_PROVISIONING ---help--- Provides thin provisioning and snapshots that share a data store. -config DM_DEBUG_BLOCK_STACK_TRACING - boolean "Keep stack trace of persistent data block lock holders" - depends on STACKTRACE_SUPPORT && DM_PERSISTENT_DATA - select STACKTRACE - ---help--- - Enable this for messages that may help debug problems with the - block manager locking used by thin provisioning and caching. - - If unsure, say N. - config DM_CACHE tristate "Cache target (EXPERIMENTAL)" depends on BLK_DEV_DM diff --git a/drivers/md/persistent-data/Kconfig b/drivers/md/persistent-data/Kconfig index 19b268795415..0c2dec7aec20 100644 --- a/drivers/md/persistent-data/Kconfig +++ b/drivers/md/persistent-data/Kconfig @@ -6,3 +6,13 @@ config DM_PERSISTENT_DATA ---help--- Library providing immutable on-disk data structure support for device-mapper targets such as the thin provisioning target. + +config DM_DEBUG_BLOCK_STACK_TRACING + boolean "Keep stack trace of persistent data block lock holders" + depends on STACKTRACE_SUPPORT && DM_PERSISTENT_DATA + select STACKTRACE + ---help--- + Enable this for messages that may help debug problems with the + block manager locking used by thin provisioning and caching. + + If unsure, say N. -- cgit v1.2.3 From 2c945820cab96ebf265598d998e63ef22393d0d4 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 3 Mar 2014 17:19:22 -0500 Subject: dm snapshot: fix metadata corruption Commit 55494bf2947dccdf2 ("dm snapshot: use dm-bufio") broke snapshots. Before that 3.14-rc1 commit, loading a snapshot's list of exceptions involved reading exception areas one by one into ps->area and inserting those exceptions into the hash table. Commit 55494bf2947dccdf2 changed it so that dm-bufio with prefetch is used to load exceptions in batchs. Exceptions are loaded correctly, but ps->area is left uninitialized. When a new exception is allocated, it is stored in this uninitialized ps->area which will be written to the disk. This causes metadata corruption. Fix this corruption by copying the last area that was read via dm-bufio into ps->area. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-snap-persistent.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'drivers/md') diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index afc3d017de4c..d6e88178d22c 100644 --- a/drivers/md/dm-snap-persistent.c +++ b/drivers/md/dm-snap-persistent.c @@ -546,6 +546,9 @@ static int read_exceptions(struct pstore *ps, r = insert_exceptions(ps, area, callback, callback_context, &full); + if (!full) + memcpy(ps->area, area, ps->store->chunk_size << SECTOR_SHIFT); + dm_bufio_release(bp); dm_bufio_forget(client, chunk); -- cgit v1.2.3 From cdc2b4158405f1975f9d5205096f08430eda1c0e Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 14 Feb 2014 18:10:55 -0500 Subject: dm thin: synchronize the pool mode during suspend Commit b5330655 ("dm thin: handle metadata failures more consistently") increased potential for the pool's mode to be changed in response to metadata operation failures. When the pool mode is changed it isn't synchronized with the mode in pool_features stored in the target's context (ti->private) that is used as the basis for (re)establishing the pool mode during resume via bind_control_target. It is important that we synchronize the pool mode when it is changed otherwise the pool may experience and unexpected mode transition on the next resume (especially if there was no new table load). Signed-off-by: Mike Snitzer Acked-by: Joe Thornber --- drivers/md/dm-thin.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'drivers/md') diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 7e84baccf0ad..dfa48ec7b8ea 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -1402,6 +1402,7 @@ static enum pool_mode get_pool_mode(struct pool *pool) static void set_pool_mode(struct pool *pool, enum pool_mode new_mode) { int r; + struct pool_c *pt = pool->ti->private; enum pool_mode old_mode = pool->pf.mode; switch (new_mode) { @@ -1448,6 +1449,11 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode) } pool->pf.mode = new_mode; + /* + * The pool mode may have changed, sync it so bind_control_target() + * doesn't cause an unexpected mode transition on resume. + */ + pt->adjusted_pf.mode = new_mode; } /* -- cgit v1.2.3 From 07f2b6e0382ec4c59887d5954683f1a0b265574e Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 14 Feb 2014 11:58:41 -0500 Subject: dm thin: ensure user takes action to validate data and metadata consistency If a thin metadata operation fails the current transaction will abort, whereby causing potential for IO layers up the stack (e.g. filesystems) to have data loss. As such, set THIN_METADATA_NEEDS_CHECK_FLAG in the thin metadata's superblock which: 1) requires the user verify the thin metadata is consistent (e.g. use thin_check, etc) 2) suggests the user verify the thin data is consistent (e.g. use fsck) The only way to clear the superblock's THIN_METADATA_NEEDS_CHECK_FLAG is to run thin_repair. On metadata operation failure: abort current metadata transaction, set pool in read-only mode, and now set the needs_check flag. As part of this change, constraints are introduced or relaxed: * don't allow a pool to transition to write mode if needs_check is set * don't allow data or metadata space to be resized if needs_check is set * if a thin pool's metadata space is exhausted: the kernel will now force the user to take the pool offline for repair before the kernel will allow the metadata space to be extended. Also, update Documentation to include information about when the thin provisioning target commits metadata, how it handles metadata failures and running out of space. Signed-off-by: Mike Snitzer Signed-off-by: Joe Thornber --- drivers/md/dm-thin-metadata.c | 37 ++++++++++++++++++++- drivers/md/dm-thin-metadata.h | 11 +++++++ drivers/md/dm-thin.c | 76 ++++++++++++++++++++++++++++++------------- 3 files changed, 101 insertions(+), 23 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index baa87ff12816..fb9efc829182 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -76,7 +76,7 @@ #define THIN_SUPERBLOCK_MAGIC 27022010 #define THIN_SUPERBLOCK_LOCATION 0 -#define THIN_VERSION 1 +#define THIN_VERSION 2 #define THIN_METADATA_CACHE_SIZE 64 #define SECTOR_TO_BLOCK_SHIFT 3 @@ -1755,3 +1755,38 @@ int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd, return r; } + +int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd) +{ + int r; + struct dm_block *sblock; + struct thin_disk_superblock *disk_super; + + down_write(&pmd->root_lock); + pmd->flags |= THIN_METADATA_NEEDS_CHECK_FLAG; + + r = superblock_lock(pmd, &sblock); + if (r) { + DMERR("couldn't read superblock"); + goto out; + } + + disk_super = dm_block_data(sblock); + disk_super->flags = cpu_to_le32(pmd->flags); + + dm_bm_unlock(sblock); +out: + up_write(&pmd->root_lock); + return r; +} + +bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd) +{ + bool needs_check; + + down_read(&pmd->root_lock); + needs_check = pmd->flags & THIN_METADATA_NEEDS_CHECK_FLAG; + up_read(&pmd->root_lock); + + return needs_check; +} diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h index 82ea384d36ff..e3c857db195a 100644 --- a/drivers/md/dm-thin-metadata.h +++ b/drivers/md/dm-thin-metadata.h @@ -25,6 +25,11 @@ /*----------------------------------------------------------------*/ +/* + * Thin metadata superblock flags. + */ +#define THIN_METADATA_NEEDS_CHECK_FLAG (1 << 0) + struct dm_pool_metadata; struct dm_thin_device; @@ -202,6 +207,12 @@ int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd, dm_sm_threshold_fn fn, void *context); +/* + * Updates the superblock immediately. + */ +int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd); +bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd); + /*----------------------------------------------------------------*/ #endif diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index dfa48ec7b8ea..a04eba905922 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -1403,7 +1403,28 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode) { int r; struct pool_c *pt = pool->ti->private; - enum pool_mode old_mode = pool->pf.mode; + bool needs_check = dm_pool_metadata_needs_check(pool->pmd); + enum pool_mode old_mode = get_pool_mode(pool); + + /* + * Never allow the pool to transition to PM_WRITE mode if user + * intervention is required to verify metadata and data consistency. + */ + if (new_mode == PM_WRITE && needs_check) { + DMERR("%s: unable to switch pool to write mode until repaired.", + dm_device_name(pool->pool_md)); + if (old_mode != new_mode) + new_mode = old_mode; + else + new_mode = PM_READ_ONLY; + } + /* + * If we were in PM_FAIL mode, rollback of metadata failed. We're + * not going to recover without a thin_repair. So we never let the + * pool move out of the old mode. + */ + if (old_mode == PM_FAIL) + new_mode = old_mode; switch (new_mode) { case PM_FAIL: @@ -1467,19 +1488,28 @@ static void out_of_data_space(struct pool *pool) set_pool_mode(pool, PM_READ_ONLY); } -static void metadata_operation_failed(struct pool *pool, const char *op, int r) +static void abort_transaction(struct pool *pool) { - dm_block_t free_blocks; + const char *dev_name = dm_device_name(pool->pool_md); + + DMERR_LIMIT("%s: aborting current metadata transaction", dev_name); + if (dm_pool_abort_metadata(pool->pmd)) { + DMERR("%s: failed to abort metadata transaction", dev_name); + set_pool_mode(pool, PM_FAIL); + } + + if (dm_pool_metadata_set_needs_check(pool->pmd)) { + DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name); + set_pool_mode(pool, PM_FAIL); + } +} +static void metadata_operation_failed(struct pool *pool, const char *op, int r) +{ DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d", dm_device_name(pool->pool_md), op, r); - if (r == -ENOSPC && - !dm_pool_get_free_metadata_block_count(pool->pmd, &free_blocks) && - !free_blocks) - DMERR_LIMIT("%s: no free metadata space available.", - dm_device_name(pool->pool_md)); - + abort_transaction(pool); set_pool_mode(pool, PM_READ_ONLY); } @@ -1693,7 +1723,7 @@ static int bind_control_target(struct pool *pool, struct dm_target *ti) /* * We want to make sure that a pool in PM_FAIL mode is never upgraded. */ - enum pool_mode old_mode = pool->pf.mode; + enum pool_mode old_mode = get_pool_mode(pool); enum pool_mode new_mode = pt->adjusted_pf.mode; /* @@ -1707,16 +1737,6 @@ static int bind_control_target(struct pool *pool, struct dm_target *ti) pool->pf = pt->adjusted_pf; pool->low_water_blocks = pt->low_water_blocks; - /* - * If we were in PM_FAIL mode, rollback of metadata failed. We're - * not going to recover without a thin_repair. So we never let the - * pool move out of the old mode. On the other hand a PM_READ_ONLY - * may have been due to a lack of metadata or data space, and may - * now work (ie. if the underlying devices have been resized). - */ - if (old_mode == PM_FAIL) - new_mode = old_mode; - set_pool_mode(pool, new_mode); return 0; @@ -2259,6 +2279,12 @@ static int maybe_resize_data_dev(struct dm_target *ti, bool *need_commit) return -EINVAL; } else if (data_size > sb_data_size) { + if (dm_pool_metadata_needs_check(pool->pmd)) { + DMERR("%s: unable to grow the data device until repaired.", + dm_device_name(pool->pool_md)); + return 0; + } + if (sb_data_size) DMINFO("%s: growing the data device from %llu to %llu blocks", dm_device_name(pool->pool_md), @@ -2300,6 +2326,12 @@ static int maybe_resize_metadata_dev(struct dm_target *ti, bool *need_commit) return -EINVAL; } else if (metadata_dev_size > sb_metadata_dev_size) { + if (dm_pool_metadata_needs_check(pool->pmd)) { + DMERR("%s: unable to grow the metadata device until repaired.", + dm_device_name(pool->pool_md)); + return 0; + } + warn_if_metadata_device_too_big(pool->md_dev); DMINFO("%s: growing the metadata device from %llu to %llu blocks", dm_device_name(pool->pool_md), @@ -2801,7 +2833,7 @@ static struct target_type pool_target = { .name = "thin-pool", .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | DM_TARGET_IMMUTABLE, - .version = {1, 10, 0}, + .version = {1, 11, 0}, .module = THIS_MODULE, .ctr = pool_ctr, .dtr = pool_dtr, @@ -3091,7 +3123,7 @@ static int thin_iterate_devices(struct dm_target *ti, static struct target_type thin_target = { .name = "thin", - .version = {1, 10, 0}, + .version = {1, 11, 0}, .module = THIS_MODULE, .ctr = thin_ctr, .dtr = thin_dtr, -- cgit v1.2.3 From 3e1a0699095803e53072699a4a1485af7744601d Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Mon, 3 Mar 2014 16:03:26 +0000 Subject: dm thin: fix out of data space handling Ideally a thin pool would never run out of data space; the low water mark would trigger userland to extend the pool before we completely run out of space. However, many small random IOs to unprovisioned space can consume data space at an alarming rate. Adjust your low water mark if you're frequently seeing "out-of-data-space" mode. Before this fix, if data space ran out the pool would be put in PM_READ_ONLY mode which also aborted the pool's current metadata transaction (data loss for any changes in the transaction). This had a side-effect of needlessly compromising data consistency. And retry of queued unserviceable bios, once the data pool was resized, could initiate changes to potentially inconsistent pool metadata. Now when the pool's data space is exhausted transition to a new pool mode (PM_OUT_OF_DATA_SPACE) that allows metadata to be changed but data may not be allocated. This allows users to remove thin volumes or discard data to recover data space. The pool is no longer put in PM_READ_ONLY mode in response to the pool running out of data space. And PM_READ_ONLY mode no longer aborts the pool's current metadata transaction. Also, set_pool_mode() will now notify userspace when the pool mode is changed. Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm-thin.c | 147 +++++++++++++++++++++++++++++++++++---------------- 1 file changed, 102 insertions(+), 45 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index a04eba905922..38a063f7afa4 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -130,10 +130,11 @@ static void build_virtual_key(struct dm_thin_device *td, dm_block_t b, struct dm_thin_new_mapping; /* - * The pool runs in 3 modes. Ordered in degraded order for comparisons. + * The pool runs in 4 modes. Ordered in degraded order for comparisons. */ enum pool_mode { PM_WRITE, /* metadata may be changed */ + PM_OUT_OF_DATA_SPACE, /* metadata may be changed, though data may not be allocated */ PM_READ_ONLY, /* metadata may not be changed */ PM_FAIL, /* all I/O fails */ }; @@ -198,7 +199,6 @@ struct pool { }; static enum pool_mode get_pool_mode(struct pool *pool); -static void out_of_data_space(struct pool *pool); static void metadata_operation_failed(struct pool *pool, const char *op, int r); /* @@ -399,6 +399,23 @@ static void requeue_io(struct thin_c *tc) spin_unlock_irqrestore(&pool->lock, flags); } +static void error_retry_list(struct pool *pool) +{ + struct bio *bio; + unsigned long flags; + struct bio_list bios; + + bio_list_init(&bios); + + spin_lock_irqsave(&pool->lock, flags); + bio_list_merge(&bios, &pool->retry_on_resume_list); + bio_list_init(&pool->retry_on_resume_list); + spin_unlock_irqrestore(&pool->lock, flags); + + while ((bio = bio_list_pop(&bios))) + bio_io_error(bio); +} + /* * This section of code contains the logic for processing a thin device's IO. * Much of the code depends on pool object resources (lists, workqueues, etc) @@ -925,13 +942,15 @@ static void check_low_water_mark(struct pool *pool, dm_block_t free_blocks) } } +static void set_pool_mode(struct pool *pool, enum pool_mode new_mode); + static int alloc_data_block(struct thin_c *tc, dm_block_t *result) { int r; dm_block_t free_blocks; struct pool *pool = tc->pool; - if (get_pool_mode(pool) != PM_WRITE) + if (WARN_ON(get_pool_mode(pool) != PM_WRITE)) return -EINVAL; r = dm_pool_get_free_block_count(pool->pmd, &free_blocks); @@ -958,7 +977,7 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result) } if (!free_blocks) { - out_of_data_space(pool); + set_pool_mode(pool, PM_OUT_OF_DATA_SPACE); return -ENOSPC; } } @@ -988,15 +1007,32 @@ static void retry_on_resume(struct bio *bio) spin_unlock_irqrestore(&pool->lock, flags); } -static void handle_unserviceable_bio(struct pool *pool, struct bio *bio) +static bool should_error_unserviceable_bio(struct pool *pool) { - /* - * When pool is read-only, no cell locking is needed because - * nothing is changing. - */ - WARN_ON_ONCE(get_pool_mode(pool) != PM_READ_ONLY); + enum pool_mode m = get_pool_mode(pool); + + switch (m) { + case PM_WRITE: + /* Shouldn't get here */ + DMERR_LIMIT("bio unserviceable, yet pool is in PM_WRITE mode"); + return true; + + case PM_OUT_OF_DATA_SPACE: + return pool->pf.error_if_no_space; + + case PM_READ_ONLY: + case PM_FAIL: + return true; + default: + /* Shouldn't get here */ + DMERR_LIMIT("bio unserviceable, yet pool has an unknown mode"); + return true; + } +} - if (pool->pf.error_if_no_space) +static void handle_unserviceable_bio(struct pool *pool, struct bio *bio) +{ + if (should_error_unserviceable_bio(pool)) bio_io_error(bio); else retry_on_resume(bio); @@ -1007,11 +1043,20 @@ static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *c struct bio *bio; struct bio_list bios; + if (should_error_unserviceable_bio(pool)) { + cell_error(pool, cell); + return; + } + bio_list_init(&bios); cell_release(pool, cell, &bios); - while ((bio = bio_list_pop(&bios))) - handle_unserviceable_bio(pool, bio); + if (should_error_unserviceable_bio(pool)) + while ((bio = bio_list_pop(&bios))) + bio_io_error(bio); + else + while ((bio = bio_list_pop(&bios))) + retry_on_resume(bio); } static void process_discard(struct thin_c *tc, struct bio *bio) @@ -1296,6 +1341,11 @@ static void process_bio_read_only(struct thin_c *tc, struct bio *bio) } } +static void process_bio_success(struct thin_c *tc, struct bio *bio) +{ + bio_endio(bio, 0); +} + static void process_bio_fail(struct thin_c *tc, struct bio *bio) { bio_io_error(bio); @@ -1399,9 +1449,15 @@ static enum pool_mode get_pool_mode(struct pool *pool) return pool->pf.mode; } +static void notify_of_pool_mode_change(struct pool *pool, const char *new_mode) +{ + dm_table_event(pool->ti->table); + DMINFO("%s: switching pool to %s mode", + dm_device_name(pool->pool_md), new_mode); +} + static void set_pool_mode(struct pool *pool, enum pool_mode new_mode) { - int r; struct pool_c *pt = pool->ti->private; bool needs_check = dm_pool_metadata_needs_check(pool->pmd); enum pool_mode old_mode = get_pool_mode(pool); @@ -1429,38 +1485,48 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode) switch (new_mode) { case PM_FAIL: if (old_mode != new_mode) - DMERR("%s: switching pool to failure mode", - dm_device_name(pool->pool_md)); + notify_of_pool_mode_change(pool, "failure"); dm_pool_metadata_read_only(pool->pmd); pool->process_bio = process_bio_fail; pool->process_discard = process_bio_fail; pool->process_prepared_mapping = process_prepared_mapping_fail; pool->process_prepared_discard = process_prepared_discard_fail; + + error_retry_list(pool); break; case PM_READ_ONLY: if (old_mode != new_mode) - DMERR("%s: switching pool to read-only mode", - dm_device_name(pool->pool_md)); - r = dm_pool_abort_metadata(pool->pmd); - if (r) { - DMERR("%s: aborting transaction failed", - dm_device_name(pool->pool_md)); - new_mode = PM_FAIL; - set_pool_mode(pool, new_mode); - } else { - dm_pool_metadata_read_only(pool->pmd); - pool->process_bio = process_bio_read_only; - pool->process_discard = process_discard; - pool->process_prepared_mapping = process_prepared_mapping_fail; - pool->process_prepared_discard = process_prepared_discard_passdown; - } + notify_of_pool_mode_change(pool, "read-only"); + dm_pool_metadata_read_only(pool->pmd); + pool->process_bio = process_bio_read_only; + pool->process_discard = process_bio_success; + pool->process_prepared_mapping = process_prepared_mapping_fail; + pool->process_prepared_discard = process_prepared_discard_passdown; + + error_retry_list(pool); + break; + + case PM_OUT_OF_DATA_SPACE: + /* + * Ideally we'd never hit this state; the low water mark + * would trigger userland to extend the pool before we + * completely run out of data space. However, many small + * IOs to unprovisioned space can consume data space at an + * alarming rate. Adjust your low water mark if you're + * frequently seeing this mode. + */ + if (old_mode != new_mode) + notify_of_pool_mode_change(pool, "out-of-data-space"); + pool->process_bio = process_bio_read_only; + pool->process_discard = process_discard; + pool->process_prepared_mapping = process_prepared_mapping; + pool->process_prepared_discard = process_prepared_discard_passdown; break; case PM_WRITE: if (old_mode != new_mode) - DMINFO("%s: switching pool to write mode", - dm_device_name(pool->pool_md)); + notify_of_pool_mode_change(pool, "write"); dm_pool_metadata_read_write(pool->pmd); pool->process_bio = process_bio; pool->process_discard = process_discard; @@ -1477,17 +1543,6 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode) pt->adjusted_pf.mode = new_mode; } -/* - * Rather than calling set_pool_mode directly, use these which describe the - * reason for mode degradation. - */ -static void out_of_data_space(struct pool *pool) -{ - DMERR_LIMIT("%s: no free data space available.", - dm_device_name(pool->pool_md)); - set_pool_mode(pool, PM_READ_ONLY); -} - static void abort_transaction(struct pool *pool) { const char *dev_name = dm_device_name(pool->pool_md); @@ -2719,7 +2774,9 @@ static void pool_status(struct dm_target *ti, status_type_t type, else DMEMIT("- "); - if (pool->pf.mode == PM_READ_ONLY) + if (pool->pf.mode == PM_OUT_OF_DATA_SPACE) + DMEMIT("out_of_data_space "); + else if (pool->pf.mode == PM_READ_ONLY) DMEMIT("ro "); else DMEMIT("rw "); -- cgit v1.2.3 From 18adc57779866ba451237dccca2ebf019be2fa20 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Mon, 3 Mar 2014 15:46:42 +0000 Subject: dm thin: fix deadlock in __requeue_bio_list The spin lock in requeue_io() was held for too long, allowing deadlock. Don't worry, due to other issues addressed in the following "dm thin: fix noflush suspend IO queueing" commit, this code was never called. Fix this by taking the spin lock for a much shorter period of time. Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm-thin.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 38a063f7afa4..f39876dd307d 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -369,14 +369,18 @@ struct dm_thin_endio_hook { struct dm_thin_new_mapping *overwrite_mapping; }; -static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master) +static void requeue_bio_list(struct thin_c *tc, struct bio_list *master) { struct bio *bio; struct bio_list bios; + unsigned long flags; bio_list_init(&bios); + + spin_lock_irqsave(&tc->pool->lock, flags); bio_list_merge(&bios, master); bio_list_init(master); + spin_unlock_irqrestore(&tc->pool->lock, flags); while ((bio = bio_list_pop(&bios))) { struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); @@ -391,12 +395,9 @@ static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master) static void requeue_io(struct thin_c *tc) { struct pool *pool = tc->pool; - unsigned long flags; - spin_lock_irqsave(&pool->lock, flags); - __requeue_bio_list(tc, &pool->deferred_bios); - __requeue_bio_list(tc, &pool->retry_on_resume_list); - spin_unlock_irqrestore(&pool->lock, flags); + requeue_bio_list(tc, &pool->deferred_bios); + requeue_bio_list(tc, &pool->retry_on_resume_list); } static void error_retry_list(struct pool *pool) -- cgit v1.2.3 From 738211f70a1d0c7ff7dc965395f7b8a436365ebd Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Mon, 3 Mar 2014 15:52:28 +0000 Subject: dm thin: fix noflush suspend IO queueing i) by the time DM core calls the postsuspend hook the dm_noflush flag has been cleared. So the old thin_postsuspend did nothing. We need to use the presuspend hook instead. ii) There was a race between bios leaving DM core and arriving in the deferred queue. thin_presuspend now sets a 'requeue' flag causing all bios destined for that thin to be requeued back to DM core. Then it requeues all held IO, and all IO on the deferred queue (destined for that thin). Finally postsuspend clears the 'requeue' flag. Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm-thin.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 72 insertions(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index f39876dd307d..be70d38745f7 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -226,6 +226,7 @@ struct thin_c { struct pool *pool; struct dm_thin_device *td; + bool requeue_mode:1; }; /*----------------------------------------------------------------*/ @@ -1379,6 +1380,11 @@ static void process_deferred_bios(struct pool *pool) struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); struct thin_c *tc = h->tc; + if (tc->requeue_mode) { + bio_endio(bio, DM_ENDIO_REQUEUE); + continue; + } + /* * If we've got no free new_mapping structs, and processing * this bio might require one, we pause until there are some @@ -1445,6 +1451,51 @@ static void do_waker(struct work_struct *ws) /*----------------------------------------------------------------*/ +struct noflush_work { + struct work_struct worker; + struct thin_c *tc; + + atomic_t complete; + wait_queue_head_t wait; +}; + +static void complete_noflush_work(struct noflush_work *w) +{ + atomic_set(&w->complete, 1); + wake_up(&w->wait); +} + +static void do_noflush_start(struct work_struct *ws) +{ + struct noflush_work *w = container_of(ws, struct noflush_work, worker); + w->tc->requeue_mode = true; + requeue_io(w->tc); + complete_noflush_work(w); +} + +static void do_noflush_stop(struct work_struct *ws) +{ + struct noflush_work *w = container_of(ws, struct noflush_work, worker); + w->tc->requeue_mode = false; + complete_noflush_work(w); +} + +static void noflush_work(struct thin_c *tc, void (*fn)(struct work_struct *)) +{ + struct noflush_work w; + + INIT_WORK(&w.worker, fn); + w.tc = tc; + atomic_set(&w.complete, 0); + init_waitqueue_head(&w.wait); + + queue_work(tc->pool->wq, &w.worker); + + wait_event(w.wait, atomic_read(&w.complete)); +} + +/*----------------------------------------------------------------*/ + static enum pool_mode get_pool_mode(struct pool *pool) { return pool->pf.mode; @@ -1616,6 +1667,11 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio) thin_hook_bio(tc, bio); + if (tc->requeue_mode) { + bio_endio(bio, DM_ENDIO_REQUEUE); + return DM_MAPIO_SUBMITTED; + } + if (get_pool_mode(tc->pool) == PM_FAIL) { bio_io_error(bio); return DM_MAPIO_SUBMITTED; @@ -3093,10 +3149,23 @@ static int thin_endio(struct dm_target *ti, struct bio *bio, int err) return 0; } -static void thin_postsuspend(struct dm_target *ti) +static void thin_presuspend(struct dm_target *ti) { + struct thin_c *tc = ti->private; + if (dm_noflush_suspending(ti)) - requeue_io((struct thin_c *)ti->private); + noflush_work(tc, do_noflush_start); +} + +static void thin_postsuspend(struct dm_target *ti) +{ + struct thin_c *tc = ti->private; + + /* + * The dm_noflush_suspending flag has been cleared by now, so + * unfortunately we must always run this. + */ + noflush_work(tc, do_noflush_stop); } /* @@ -3187,6 +3256,7 @@ static struct target_type thin_target = { .dtr = thin_dtr, .map = thin_map, .end_io = thin_endio, + .presuspend = thin_presuspend, .postsuspend = thin_postsuspend, .status = thin_status, .iterate_devices = thin_iterate_devices, -- cgit v1.2.3 From cebc2de44d3bce53e46476e774126c298ca2c8a9 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Fri, 7 Mar 2014 14:57:19 +0000 Subject: dm space map metadata: fix refcount decrement below 0 which caused corruption This has been a relatively long-standing issue that wasn't nailed down until Teng-Feng Yang's meticulous bug report to dm-devel on 3/7/2014, see: http://www.redhat.com/archives/dm-devel/2014-March/msg00021.html From that report: "When decreasing the reference count of a metadata block with its reference count equals 3, we will call dm_btree_remove() to remove this enrty from the B+tree which keeps the reference count info in metadata device. The B+tree will try to rebalance the entry of the child nodes in each node it traversed, and the rebalance process contains the following steps. (1) Finding the corresponding children in current node (shadow_current(s)) (2) Shadow the children block (issue BOP_INC) (3) redistribute keys among children, and free children if necessary (issue BOP_DEC) Since the update of a metadata block's reference count could be recursive, we will stash these reference count update operations in smm->uncommitted and then process them in a FILO fashion. The problem is that step(3) could free the children which is created in step(2), so the BOP_DEC issued in step(3) will be carried out before the BOP_INC issued in step(2) since these BOPs will be processed in FILO fashion. Once the BOP_DEC from step(3) tries to decrease the reference count of newly shadow block, it will report failure for its reference equals 0 before decreasing. It looks like we can solve this issue by processing these BOPs in a FIFO fashion instead of FILO." Commit 5b564d80 ("dm space map: disallow decrementing a reference count below zero") changed the code to report an error for this temporary refcount decrement below zero. So what was previously a harmless invalid refcount became a hard failure due to the new error path: device-mapper: space map common: unable to decrement a reference count below 0 device-mapper: thin: 253:6: dm_thin_insert_block() failed: error = -22 device-mapper: thin: 253:6: switching pool to read-only mode This bug is in dm persistent-data code that is common to the DM thin and cache targets. So any users of those targets should apply this fix. Fix this by applying recursive space map operations in FIFO order rather than FILO. Resolves: https://bugzilla.kernel.org/show_bug.cgi?id=68801 Reported-by: Apollon Oikonomopoulos Reported-by: edwillam1007@gmail.com Reported-by: Teng-Feng Yang Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer Cc: stable@vger.kernel.org # 3.13+ --- drivers/md/persistent-data/dm-space-map-metadata.c | 113 +++++++++++++++++---- 1 file changed, 92 insertions(+), 21 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c index e9bdd462f4f5..786b689bdfc7 100644 --- a/drivers/md/persistent-data/dm-space-map-metadata.c +++ b/drivers/md/persistent-data/dm-space-map-metadata.c @@ -91,6 +91,69 @@ struct block_op { dm_block_t block; }; +struct bop_ring_buffer { + unsigned begin; + unsigned end; + struct block_op bops[MAX_RECURSIVE_ALLOCATIONS + 1]; +}; + +static void brb_init(struct bop_ring_buffer *brb) +{ + brb->begin = 0; + brb->end = 0; +} + +static bool brb_empty(struct bop_ring_buffer *brb) +{ + return brb->begin == brb->end; +} + +static unsigned brb_next(struct bop_ring_buffer *brb, unsigned old) +{ + unsigned r = old + 1; + return (r >= (sizeof(brb->bops) / sizeof(*brb->bops))) ? 0 : r; +} + +static int brb_push(struct bop_ring_buffer *brb, + enum block_op_type type, dm_block_t b) +{ + struct block_op *bop; + unsigned next = brb_next(brb, brb->end); + + /* + * We don't allow the last bop to be filled, this way we can + * differentiate between full and empty. + */ + if (next == brb->begin) + return -ENOMEM; + + bop = brb->bops + brb->end; + bop->type = type; + bop->block = b; + + brb->end = next; + + return 0; +} + +static int brb_pop(struct bop_ring_buffer *brb, struct block_op *result) +{ + struct block_op *bop; + + if (brb_empty(brb)) + return -ENODATA; + + bop = brb->bops + brb->begin; + result->type = bop->type; + result->block = bop->block; + + brb->begin = brb_next(brb, brb->begin); + + return 0; +} + +/*----------------------------------------------------------------*/ + struct sm_metadata { struct dm_space_map sm; @@ -101,25 +164,20 @@ struct sm_metadata { unsigned recursion_count; unsigned allocated_this_transaction; - unsigned nr_uncommitted; - struct block_op uncommitted[MAX_RECURSIVE_ALLOCATIONS]; + struct bop_ring_buffer uncommitted; struct threshold threshold; }; static int add_bop(struct sm_metadata *smm, enum block_op_type type, dm_block_t b) { - struct block_op *op; + int r = brb_push(&smm->uncommitted, type, b); - if (smm->nr_uncommitted == MAX_RECURSIVE_ALLOCATIONS) { + if (r) { DMERR("too many recursive allocations"); return -ENOMEM; } - op = smm->uncommitted + smm->nr_uncommitted++; - op->type = type; - op->block = b; - return 0; } @@ -158,11 +216,17 @@ static int out(struct sm_metadata *smm) return -ENOMEM; } - if (smm->recursion_count == 1 && smm->nr_uncommitted) { - while (smm->nr_uncommitted && !r) { - smm->nr_uncommitted--; - r = commit_bop(smm, smm->uncommitted + - smm->nr_uncommitted); + if (smm->recursion_count == 1) { + while (!brb_empty(&smm->uncommitted)) { + struct block_op bop; + + r = brb_pop(&smm->uncommitted, &bop); + if (r) { + DMERR("bug in bop ring buffer"); + break; + } + + r = commit_bop(smm, &bop); if (r) break; } @@ -217,7 +281,8 @@ static int sm_metadata_get_nr_free(struct dm_space_map *sm, dm_block_t *count) static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b, uint32_t *result) { - int r, i; + int r; + unsigned i; struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); unsigned adjustment = 0; @@ -225,8 +290,10 @@ static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b, * We may have some uncommitted adjustments to add. This list * should always be really short. */ - for (i = 0; i < smm->nr_uncommitted; i++) { - struct block_op *op = smm->uncommitted + i; + for (i = smm->uncommitted.begin; + i != smm->uncommitted.end; + i = brb_next(&smm->uncommitted, i)) { + struct block_op *op = smm->uncommitted.bops + i; if (op->block != b) continue; @@ -254,7 +321,8 @@ static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b, static int sm_metadata_count_is_more_than_one(struct dm_space_map *sm, dm_block_t b, int *result) { - int r, i, adjustment = 0; + int r, adjustment = 0; + unsigned i; struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); uint32_t rc; @@ -262,8 +330,11 @@ static int sm_metadata_count_is_more_than_one(struct dm_space_map *sm, * We may have some uncommitted adjustments to add. This list * should always be really short. */ - for (i = 0; i < smm->nr_uncommitted; i++) { - struct block_op *op = smm->uncommitted + i; + for (i = smm->uncommitted.begin; + i != smm->uncommitted.end; + i = brb_next(&smm->uncommitted, i)) { + + struct block_op *op = smm->uncommitted.bops + i; if (op->block != b) continue; @@ -671,7 +742,7 @@ int dm_sm_metadata_create(struct dm_space_map *sm, smm->begin = superblock + 1; smm->recursion_count = 0; smm->allocated_this_transaction = 0; - smm->nr_uncommitted = 0; + brb_init(&smm->uncommitted); threshold_init(&smm->threshold); memcpy(&smm->sm, &bootstrap_ops, sizeof(smm->sm)); @@ -715,7 +786,7 @@ int dm_sm_metadata_open(struct dm_space_map *sm, smm->begin = 0; smm->recursion_count = 0; smm->allocated_this_transaction = 0; - smm->nr_uncommitted = 0; + brb_init(&smm->uncommitted); threshold_init(&smm->threshold); memcpy(&smm->old_ll, &smm->ll, sizeof(smm->old_ll)); -- cgit v1.2.3