author     Linus Torvalds <torvalds@linux-foundation.org>   2013-03-05 17:22:08 -0800
committer  Linus Torvalds <torvalds@linux-foundation.org>   2013-03-05 17:22:08 -0800
commit     a5e0d73163a848060ac0c2c054274e84a654986e
tree       708ab200af3653090d3cad95228fae21aa6c52cd
parent     6dbe51c251a327e012439c4772097a13df43c5b8
parent     f3378b48705154b9089affb2d2e939622aea68f1
Merge tag 'md-3.9' of git://neil.brown.name/md
Pull md updates from NeilBrown:
"Mostly little bugfixes.
Only "feature" is a new RAID10 layout which slightly improves the
number of sets of devices that can concurrently fail, without data
loss."
* tag 'md-3.9' of git://neil.brown.name/md:
md: expedite metadata update when switching read-auto -> active
md: remove CONFIG_MULTICORE_RAID456
md/raid1,raid10: fix deadlock with freeze_array()
md/raid0: improve error message when converting RAID4-with-spares to RAID0
md: raid0: fix error return from create_stripe_zones.
md: fix two bugs when attempting to resize RAID0 array.
DM RAID: Add support for MD's RAID10 "far" and "offset" algorithms
MD RAID10: Improve redundancy for 'far' and 'offset' algorithms (part 2)
MD RAID10: Improve redundancy for 'far' and 'offset' algorithms (part 1)
MD RAID10: Minor non-functional code changes
md: raid1,10: Handle REQ_WRITE_SAME flag in write bios
md: protect against crash upon fsync on ro array
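The new layout referred to above is selected through the md RAID10 "layout" word: per the raid10.c comment and the dm-raid.c helpers changed below, the low byte holds near_copies, the second byte holds far_copies, bit 16 marks "far_offset", and bit 17 marks the new "use_far_sets" variant. Not part of the patch, but as a minimal userspace sketch of that encoding (the sample values are the ones dm-raid's raid10_format_to_md_layout() produces for 2 copies):

#include <stdio.h>

/* Decode an md RAID10 layout word as described in the raid10.c comment below. */
static void decode_raid10_layout(int layout)
{
	int near_copies  = layout & 0xFF;          /* low byte          */
	int far_copies   = (layout >> 8) & 0xFF;   /* second byte       */
	int far_offset   = !!(layout & (1 << 16)); /* bit 16            */
	int use_far_sets = !!(layout & (1 << 17)); /* bit 17 (new here) */

	printf("layout 0x%05x: near=%d far=%d offset=%d far_sets=%d\n",
	       layout, near_copies, far_copies, far_offset, use_far_sets);
}

int main(void)
{
	decode_raid10_layout(0x102);   /* "near", 2 copies                  */
	decode_raid10_layout(0x20201); /* "far", 2 copies, use_far_sets set */
	decode_raid10_layout(0x30201); /* "offset", 2 copies, use_far_sets  */
	return 0;
}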
 Documentation/device-mapper/dm-raid.txt |  44
 drivers/md/Kconfig                      |  11
 drivers/md/dm-raid.c                    | 123
 drivers/md/md.c                         |  19
 drivers/md/raid0.c                      |  13
 drivers/md/raid1.c                      |   8
 drivers/md/raid10.c                     |  97
 drivers/md/raid10.h                     |   5
 drivers/md/raid5.c                      |  38
 9 files changed, 256 insertions(+), 102 deletions(-)
diff --git a/Documentation/device-mapper/dm-raid.txt b/Documentation/device-mapper/dm-raid.txt index 56fb62b09fc5..b428556197c9 100644 --- a/Documentation/device-mapper/dm-raid.txt +++ b/Documentation/device-mapper/dm-raid.txt @@ -30,6 +30,7 @@ The target is named "raid" and it accepts the following parameters: raid10 Various RAID10 inspired algorithms chosen by additional params - RAID10: Striped Mirrors (aka 'Striping on top of mirrors') - RAID1E: Integrated Adjacent Stripe Mirroring + - RAID1E: Integrated Offset Stripe Mirroring - and other similar RAID10 variants Reference: Chapter 4 of @@ -64,15 +65,15 @@ The target is named "raid" and it accepts the following parameters: synchronisation state for each region. [raid10_copies <# copies>] - [raid10_format near] + [raid10_format <near|far|offset>] These two options are used to alter the default layout of a RAID10 configuration. The number of copies is can be - specified, but the default is 2. There are other variations - to how the copies are laid down - the default and only current - option is "near". Near copies are what most people think of - with respect to mirroring. If these options are left - unspecified, or 'raid10_copies 2' and/or 'raid10_format near' - are given, then the layouts for 2, 3 and 4 devices are: + specified, but the default is 2. There are also three + variations to how the copies are laid down - the default + is "near". Near copies are what most people think of with + respect to mirroring. If these options are left unspecified, + or 'raid10_copies 2' and/or 'raid10_format near' are given, + then the layouts for 2, 3 and 4 devices are: 2 drives 3 drives 4 drives -------- ---------- -------------- A1 A1 A1 A1 A2 A1 A1 A2 A2 @@ -85,6 +86,33 @@ The target is named "raid" and it accepts the following parameters: 3-device layout is what might be called a 'RAID1E - Integrated Adjacent Stripe Mirroring'. + If 'raid10_copies 2' and 'raid10_format far', then the layouts + for 2, 3 and 4 devices are: + 2 drives 3 drives 4 drives + -------- -------------- -------------------- + A1 A2 A1 A2 A3 A1 A2 A3 A4 + A3 A4 A4 A5 A6 A5 A6 A7 A8 + A5 A6 A7 A8 A9 A9 A10 A11 A12 + .. .. .. .. .. .. .. .. .. + A2 A1 A3 A1 A2 A2 A1 A4 A3 + A4 A3 A6 A4 A5 A6 A5 A8 A7 + A6 A5 A9 A7 A8 A10 A9 A12 A11 + .. .. .. .. .. .. .. .. .. + + If 'raid10_copies 2' and 'raid10_format offset', then the + layouts for 2, 3 and 4 devices are: + 2 drives 3 drives 4 drives + -------- ------------ ----------------- + A1 A2 A1 A2 A3 A1 A2 A3 A4 + A2 A1 A3 A1 A2 A2 A1 A4 A3 + A3 A4 A4 A5 A6 A5 A6 A7 A8 + A4 A3 A6 A4 A5 A6 A5 A8 A7 + A5 A6 A7 A8 A9 A9 A10 A11 A12 + A6 A5 A9 A7 A8 A10 A9 A12 A11 + .. .. .. .. .. .. .. .. .. + Here we see layouts closely akin to 'RAID1E - Integrated + Offset Stripe Mirroring'. + <#raid_devs>: The number of devices composing the array. Each device consists of two entries. The first is the device containing the metadata (if any); the second is the one containing the @@ -142,3 +170,5 @@ Version History 1.3.0 Added support for RAID 10 1.3.1 Allow device replacement/rebuild for RAID 10 1.3.2 Fix/improve redundancy checking for RAID10 +1.4.0 Non-functional change. Removes arg from mapping function. +1.4.1 Add RAID10 "far" and "offset" algorithm support. diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index e30b490055aa..4d8d90b4fe78 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -154,17 +154,6 @@ config MD_RAID456 If unsure, say Y. 
-config MULTICORE_RAID456 - bool "RAID-4/RAID-5/RAID-6 Multicore processing (EXPERIMENTAL)" - depends on MD_RAID456 - depends on SMP - depends on EXPERIMENTAL - ---help--- - Enable the raid456 module to dispatch per-stripe raid operations to a - thread pool. - - If unsure, say N. - config MD_MULTIPATH tristate "Multipath I/O support" depends on BLK_DEV_MD diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 9a01d1e4c783..311e3d35b272 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -91,15 +91,44 @@ static struct raid_type { {"raid6_nc", "RAID6 (N continue)", 2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE} }; +static char *raid10_md_layout_to_format(int layout) +{ + /* + * Bit 16 and 17 stand for "offset" and "use_far_sets" + * Refer to MD's raid10.c for details + */ + if ((layout & 0x10000) && (layout & 0x20000)) + return "offset"; + + if ((layout & 0xFF) > 1) + return "near"; + + return "far"; +} + static unsigned raid10_md_layout_to_copies(int layout) { - return layout & 0xFF; + if ((layout & 0xFF) > 1) + return layout & 0xFF; + return (layout >> 8) & 0xFF; } static int raid10_format_to_md_layout(char *format, unsigned copies) { - /* 1 "far" copy, and 'copies' "near" copies */ - return (1 << 8) | (copies & 0xFF); + unsigned n = 1, f = 1; + + if (!strcmp("near", format)) + n = copies; + else + f = copies; + + if (!strcmp("offset", format)) + return 0x30000 | (f << 8) | n; + + if (!strcmp("far", format)) + return 0x20000 | (f << 8) | n; + + return (f << 8) | n; } static struct raid_type *get_raid_type(char *name) @@ -352,6 +381,7 @@ static int validate_raid_redundancy(struct raid_set *rs) { unsigned i, rebuild_cnt = 0; unsigned rebuilds_per_group, copies, d; + unsigned group_size, last_group_start; for (i = 0; i < rs->md.raid_disks; i++) if (!test_bit(In_sync, &rs->dev[i].rdev.flags) || @@ -379,9 +409,6 @@ static int validate_raid_redundancy(struct raid_set *rs) * as long as the failed devices occur in different mirror * groups (i.e. different stripes). * - * Right now, we only allow for "near" copies. When other - * formats are added, we will have to check those too. - * * When checking "near" format, make sure no adjacent devices * have failed beyond what can be handled. In addition to the * simple case where the number of devices is a multiple of the @@ -391,14 +418,41 @@ static int validate_raid_redundancy(struct raid_set *rs) * A A B B C * C D D E E */ - for (i = 0; i < rs->md.raid_disks * copies; i++) { - if (!(i % copies)) + if (!strcmp("near", raid10_md_layout_to_format(rs->md.layout))) { + for (i = 0; i < rs->md.raid_disks * copies; i++) { + if (!(i % copies)) + rebuilds_per_group = 0; + d = i % rs->md.raid_disks; + if ((!rs->dev[d].rdev.sb_page || + !test_bit(In_sync, &rs->dev[d].rdev.flags)) && + (++rebuilds_per_group >= copies)) + goto too_many; + } + break; + } + + /* + * When checking "far" and "offset" formats, we need to ensure + * that the device that holds its copy is not also dead or + * being rebuilt. (Note that "far" and "offset" formats only + * support two copies right now. These formats also only ever + * use the 'use_far_sets' variant.) + * + * This check is somewhat complicated by the need to account + * for arrays that are not a multiple of (far) copies. This + * results in the need to treat the last (potentially larger) + * set differently. 
+ */ + group_size = (rs->md.raid_disks / copies); + last_group_start = (rs->md.raid_disks / group_size) - 1; + last_group_start *= group_size; + for (i = 0; i < rs->md.raid_disks; i++) { + if (!(i % copies) && !(i > last_group_start)) rebuilds_per_group = 0; - d = i % rs->md.raid_disks; - if ((!rs->dev[d].rdev.sb_page || - !test_bit(In_sync, &rs->dev[d].rdev.flags)) && + if ((!rs->dev[i].rdev.sb_page || + !test_bit(In_sync, &rs->dev[i].rdev.flags)) && (++rebuilds_per_group >= copies)) - goto too_many; + goto too_many; } break; default: @@ -433,7 +487,7 @@ too_many: * * RAID10-only options: * [raid10_copies <# copies>] Number of copies. (Default: 2) - * [raid10_format <near>] Layout algorithm. (Default: near) + * [raid10_format <near|far|offset>] Layout algorithm. (Default: near) */ static int parse_raid_params(struct raid_set *rs, char **argv, unsigned num_raid_params) @@ -520,7 +574,9 @@ static int parse_raid_params(struct raid_set *rs, char **argv, rs->ti->error = "'raid10_format' is an invalid parameter for this RAID type"; return -EINVAL; } - if (strcmp("near", argv[i])) { + if (strcmp("near", argv[i]) && + strcmp("far", argv[i]) && + strcmp("offset", argv[i])) { rs->ti->error = "Invalid 'raid10_format' value given"; return -EINVAL; } @@ -644,6 +700,15 @@ static int parse_raid_params(struct raid_set *rs, char **argv, return -EINVAL; } + /* + * If the format is not "near", we only support + * two copies at the moment. + */ + if (strcmp("near", raid10_format) && (raid10_copies > 2)) { + rs->ti->error = "Too many copies for given RAID10 format."; + return -EINVAL; + } + /* (Len * #mirrors) / #devices */ sectors_per_dev = rs->ti->len * raid10_copies; sector_div(sectors_per_dev, rs->md.raid_disks); @@ -854,17 +919,30 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev) /* * Reshaping is not currently allowed */ - if ((le32_to_cpu(sb->level) != mddev->level) || - (le32_to_cpu(sb->layout) != mddev->layout) || - (le32_to_cpu(sb->stripe_sectors) != mddev->chunk_sectors)) { - DMERR("Reshaping arrays not yet supported."); + if (le32_to_cpu(sb->level) != mddev->level) { + DMERR("Reshaping arrays not yet supported. (RAID level change)"); + return -EINVAL; + } + if (le32_to_cpu(sb->layout) != mddev->layout) { + DMERR("Reshaping arrays not yet supported. (RAID layout change)"); + DMERR(" 0x%X vs 0x%X", le32_to_cpu(sb->layout), mddev->layout); + DMERR(" Old layout: %s w/ %d copies", + raid10_md_layout_to_format(le32_to_cpu(sb->layout)), + raid10_md_layout_to_copies(le32_to_cpu(sb->layout))); + DMERR(" New layout: %s w/ %d copies", + raid10_md_layout_to_format(mddev->layout), + raid10_md_layout_to_copies(mddev->layout)); + return -EINVAL; + } + if (le32_to_cpu(sb->stripe_sectors) != mddev->chunk_sectors) { + DMERR("Reshaping arrays not yet supported. (stripe sectors change)"); return -EINVAL; } /* We can only change the number of devices in RAID1 right now */ if ((rs->raid_type->level != 1) && (le32_to_cpu(sb->num_devices) != mddev->raid_disks)) { - DMERR("Reshaping arrays not yet supported."); + DMERR("Reshaping arrays not yet supported. 
(device count change)"); return -EINVAL; } @@ -1329,7 +1407,8 @@ static void raid_status(struct dm_target *ti, status_type_t type, raid10_md_layout_to_copies(rs->md.layout)); if (rs->print_flags & DMPF_RAID10_FORMAT) - DMEMIT(" raid10_format near"); + DMEMIT(" raid10_format %s", + raid10_md_layout_to_format(rs->md.layout)); DMEMIT(" %d", rs->md.raid_disks); for (i = 0; i < rs->md.raid_disks; i++) { @@ -1418,6 +1497,10 @@ static struct target_type raid_target = { static int __init dm_raid_init(void) { + DMINFO("Loading target version %u.%u.%u", + raid_target.version[0], + raid_target.version[1], + raid_target.version[2]); return dm_register_target(&raid_target); } diff --git a/drivers/md/md.c b/drivers/md/md.c index 3db3d1b271f7..fcb878f88796 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -307,6 +307,10 @@ static void md_make_request(struct request_queue *q, struct bio *bio) bio_io_error(bio); return; } + if (mddev->ro == 1 && unlikely(rw == WRITE)) { + bio_endio(bio, bio_sectors(bio) == 0 ? 0 : -EROFS); + return; + } smp_rmb(); /* Ensure implications of 'active' are visible */ rcu_read_lock(); if (mddev->suspended) { @@ -2994,6 +2998,9 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) } else if (!sectors) sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) - rdev->data_offset; + if (!my_mddev->pers->resize) + /* Cannot change size for RAID0 or Linear etc */ + return -EINVAL; } if (sectors < my_mddev->dev_sectors) return -EINVAL; /* component must fit device */ @@ -6525,7 +6532,17 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, mddev->ro = 0; sysfs_notify_dirent_safe(mddev->sysfs_state); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - md_wakeup_thread(mddev->thread); + /* mddev_unlock will wake thread */ + /* If a device failed while we were read-only, we + * need to make sure the metadata is updated now. 
+ */ + if (test_bit(MD_CHANGE_DEVS, &mddev->flags)) { + mddev_unlock(mddev); + wait_event(mddev->sb_wait, + !test_bit(MD_CHANGE_DEVS, &mddev->flags) && + !test_bit(MD_CHANGE_PENDING, &mddev->flags)); + mddev_lock(mddev); + } } else { err = -EROFS; goto abort_unlock; diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 24b359717a7e..0505452de8d6 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -175,7 +175,13 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) rdev1->new_raid_disk = j; } - if (j < 0 || j >= mddev->raid_disks) { + if (j < 0) { + printk(KERN_ERR + "md/raid0:%s: remove inactive devices before converting to RAID0\n", + mdname(mddev)); + goto abort; + } + if (j >= mddev->raid_disks) { printk(KERN_ERR "md/raid0:%s: bad disk number %d - " "aborting!\n", mdname(mddev), j); goto abort; @@ -289,7 +295,7 @@ abort: kfree(conf->strip_zone); kfree(conf->devlist); kfree(conf); - *private_conf = NULL; + *private_conf = ERR_PTR(err); return err; } @@ -411,7 +417,8 @@ static sector_t raid0_size(struct mddev *mddev, sector_t sectors, int raid_disks "%s does not support generic reshape\n", __func__); rdev_for_each(rdev, mddev) - array_sectors += rdev->sectors; + array_sectors += (rdev->sectors & + ~(sector_t)(mddev->chunk_sectors-1)); return array_sectors; } diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index d5bddfc4010e..fd86b372692d 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -967,6 +967,7 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule) bio_list_merge(&conf->pending_bio_list, &plug->pending); conf->pending_count += plug->pending_cnt; spin_unlock_irq(&conf->device_lock); + wake_up(&conf->wait_barrier); md_wakeup_thread(mddev->thread); kfree(plug); return; @@ -1000,6 +1001,7 @@ static void make_request(struct mddev *mddev, struct bio * bio) const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA)); const unsigned long do_discard = (bio->bi_rw & (REQ_DISCARD | REQ_SECURE)); + const unsigned long do_same = (bio->bi_rw & REQ_WRITE_SAME); struct md_rdev *blocked_rdev; struct blk_plug_cb *cb; struct raid1_plug_cb *plug = NULL; @@ -1301,7 +1303,8 @@ read_again: conf->mirrors[i].rdev->data_offset); mbio->bi_bdev = conf->mirrors[i].rdev->bdev; mbio->bi_end_io = raid1_end_write_request; - mbio->bi_rw = WRITE | do_flush_fua | do_sync | do_discard; + mbio->bi_rw = + WRITE | do_flush_fua | do_sync | do_discard | do_same; mbio->bi_private = r1_bio; atomic_inc(&r1_bio->remaining); @@ -2818,6 +2821,9 @@ static int run(struct mddev *mddev) if (IS_ERR(conf)) return PTR_ERR(conf); + if (mddev->queue) + blk_queue_max_write_same_sectors(mddev->queue, + mddev->chunk_sectors); rdev_for_each(rdev, mddev) { if (!mddev->gendisk) continue; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 64d48249c03b..77b562d18a90 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -38,21 +38,36 @@ * near_copies (stored in low byte of layout) * far_copies (stored in second byte of layout) * far_offset (stored in bit 16 of layout ) + * use_far_sets (stored in bit 17 of layout ) * - * The data to be stored is divided into chunks using chunksize. - * Each device is divided into far_copies sections. - * In each section, chunks are laid out in a style similar to raid0, but - * near_copies copies of each chunk is stored (each on a different drive). - * The starting device for each section is offset near_copies from the starting - * device of the previous section. 
- * Thus they are (near_copies*far_copies) of each chunk, and each is on a different - * drive. - * near_copies and far_copies must be at least one, and their product is at most - * raid_disks. + * The data to be stored is divided into chunks using chunksize. Each device + * is divided into far_copies sections. In each section, chunks are laid out + * in a style similar to raid0, but near_copies copies of each chunk is stored + * (each on a different drive). The starting device for each section is offset + * near_copies from the starting device of the previous section. Thus there + * are (near_copies * far_copies) of each chunk, and each is on a different + * drive. near_copies and far_copies must be at least one, and their product + * is at most raid_disks. * * If far_offset is true, then the far_copies are handled a bit differently. - * The copies are still in different stripes, but instead of be very far apart - * on disk, there are adjacent stripes. + * The copies are still in different stripes, but instead of being very far + * apart on disk, there are adjacent stripes. + * + * The far and offset algorithms are handled slightly differently if + * 'use_far_sets' is true. In this case, the array's devices are grouped into + * sets that are (near_copies * far_copies) in size. The far copied stripes + * are still shifted by 'near_copies' devices, but this shifting stays confined + * to the set rather than the entire array. This is done to improve the number + * of device combinations that can fail without causing the array to fail. + * Example 'far' algorithm w/o 'use_far_sets' (each letter represents a chunk + * on a device): + * A B C D A B C D E + * ... ... + * D A B C E A B C D + * Example 'far' algorithm w/ 'use_far_sets' enabled (sets illustrated w/ []'s): + * [A B] [C D] [A B] [C D E] + * |...| |...| |...| | ... 
| + * [B A] [D C] [B A] [E C D] */ /* @@ -535,6 +550,13 @@ static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio) sector_t stripe; int dev; int slot = 0; + int last_far_set_start, last_far_set_size; + + last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1; + last_far_set_start *= geo->far_set_size; + + last_far_set_size = geo->far_set_size; + last_far_set_size += (geo->raid_disks % geo->far_set_size); /* now calculate first sector/dev */ chunk = r10bio->sector >> geo->chunk_shift; @@ -551,15 +573,25 @@ static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio) /* and calculate all the others */ for (n = 0; n < geo->near_copies; n++) { int d = dev; + int set; sector_t s = sector; - r10bio->devs[slot].addr = sector; r10bio->devs[slot].devnum = d; + r10bio->devs[slot].addr = s; slot++; for (f = 1; f < geo->far_copies; f++) { + set = d / geo->far_set_size; d += geo->near_copies; - if (d >= geo->raid_disks) - d -= geo->raid_disks; + + if ((geo->raid_disks % geo->far_set_size) && + (d > last_far_set_start)) { + d -= last_far_set_start; + d %= last_far_set_size; + d += last_far_set_start; + } else { + d %= geo->far_set_size; + d += geo->far_set_size * set; + } s += geo->stride; r10bio->devs[slot].devnum = d; r10bio->devs[slot].addr = s; @@ -595,6 +627,20 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev) * or recovery, so reshape isn't happening */ struct geom *geo = &conf->geo; + int far_set_start = (dev / geo->far_set_size) * geo->far_set_size; + int far_set_size = geo->far_set_size; + int last_far_set_start; + + if (geo->raid_disks % geo->far_set_size) { + last_far_set_start = (geo->raid_disks / geo->far_set_size) - 1; + last_far_set_start *= geo->far_set_size; + + if (dev >= last_far_set_start) { + far_set_size = geo->far_set_size; + far_set_size += (geo->raid_disks % geo->far_set_size); + far_set_start = last_far_set_start; + } + } offset = sector & geo->chunk_mask; if (geo->far_offset) { @@ -602,13 +648,13 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev) chunk = sector >> geo->chunk_shift; fc = sector_div(chunk, geo->far_copies); dev -= fc * geo->near_copies; - if (dev < 0) - dev += geo->raid_disks; + if (dev < far_set_start) + dev += far_set_size; } else { while (sector >= geo->stride) { sector -= geo->stride; - if (dev < geo->near_copies) - dev += geo->raid_disks - geo->near_copies; + if (dev < (geo->near_copies + far_set_start)) + dev += far_set_size - geo->near_copies; else dev -= geo->near_copies; } @@ -1073,6 +1119,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule) bio_list_merge(&conf->pending_bio_list, &plug->pending); conf->pending_count += plug->pending_cnt; spin_unlock_irq(&conf->device_lock); + wake_up(&conf->wait_barrier); md_wakeup_thread(mddev->thread); kfree(plug); return; @@ -1105,6 +1152,7 @@ static void make_request(struct mddev *mddev, struct bio * bio) const unsigned long do_fua = (bio->bi_rw & REQ_FUA); const unsigned long do_discard = (bio->bi_rw & (REQ_DISCARD | REQ_SECURE)); + const unsigned long do_same = (bio->bi_rw & REQ_WRITE_SAME); unsigned long flags; struct md_rdev *blocked_rdev; struct blk_plug_cb *cb; @@ -1460,7 +1508,8 @@ retry_write: rdev)); mbio->bi_bdev = rdev->bdev; mbio->bi_end_io = raid10_end_write_request; - mbio->bi_rw = WRITE | do_sync | do_fua | do_discard; + mbio->bi_rw = + WRITE | do_sync | do_fua | do_discard | do_same; mbio->bi_private = r10_bio; atomic_inc(&r10_bio->remaining); @@ -1502,7 +1551,8 @@ retry_write: 
r10_bio, rdev)); mbio->bi_bdev = rdev->bdev; mbio->bi_end_io = raid10_end_write_request; - mbio->bi_rw = WRITE | do_sync | do_fua | do_discard; + mbio->bi_rw = + WRITE | do_sync | do_fua | do_discard | do_same; mbio->bi_private = r10_bio; atomic_inc(&r10_bio->remaining); @@ -3436,7 +3486,7 @@ static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new) disks = mddev->raid_disks + mddev->delta_disks; break; } - if (layout >> 17) + if (layout >> 18) return -1; if (chunk < (PAGE_SIZE >> 9) || !is_power_of_2(chunk)) @@ -3448,6 +3498,7 @@ static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new) geo->near_copies = nc; geo->far_copies = fc; geo->far_offset = fo; + geo->far_set_size = (layout & (1<<17)) ? disks / fc : disks; geo->chunk_mask = chunk - 1; geo->chunk_shift = ffz(~chunk); return nc*fc; @@ -3569,6 +3620,8 @@ static int run(struct mddev *mddev) if (mddev->queue) { blk_queue_max_discard_sectors(mddev->queue, mddev->chunk_sectors); + blk_queue_max_write_same_sectors(mddev->queue, + mddev->chunk_sectors); blk_queue_io_min(mddev->queue, chunk_size); if (conf->geo.raid_disks % conf->geo.near_copies) blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks); diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 1054cf602345..157d69e83ff4 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h @@ -33,6 +33,11 @@ struct r10conf { * far_offset, in which case it is * 1 stripe. */ + int far_set_size; /* The number of devices in a set, + * where a 'set' are devices that + * contain far/offset copies of + * each other. + */ int chunk_shift; /* shift from chunks to sectors */ sector_t chunk_mask; } prev, geo; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 5af2d2709081..3ee2912889e7 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1403,7 +1403,7 @@ static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu &sh->ops.zero_sum_result, percpu->spare_page, &submit); } -static void __raid_run_ops(struct stripe_head *sh, unsigned long ops_request) +static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) { int overlap_clear = 0, i, disks = sh->disks; struct dma_async_tx_descriptor *tx = NULL; @@ -1468,36 +1468,6 @@ static void __raid_run_ops(struct stripe_head *sh, unsigned long ops_request) put_cpu(); } -#ifdef CONFIG_MULTICORE_RAID456 -static void async_run_ops(void *param, async_cookie_t cookie) -{ - struct stripe_head *sh = param; - unsigned long ops_request = sh->ops.request; - - clear_bit_unlock(STRIPE_OPS_REQ_PENDING, &sh->state); - wake_up(&sh->ops.wait_for_ops); - - __raid_run_ops(sh, ops_request); - release_stripe(sh); -} - -static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) -{ - /* since handle_stripe can be called outside of raid5d context - * we need to ensure sh->ops.request is de-staged before another - * request arrives - */ - wait_event(sh->ops.wait_for_ops, - !test_and_set_bit_lock(STRIPE_OPS_REQ_PENDING, &sh->state)); - sh->ops.request = ops_request; - - atomic_inc(&sh->count); - async_schedule(async_run_ops, sh); -} -#else -#define raid_run_ops __raid_run_ops -#endif - static int grow_one_stripe(struct r5conf *conf) { struct stripe_head *sh; @@ -1506,9 +1476,6 @@ static int grow_one_stripe(struct r5conf *conf) return 0; sh->raid_conf = conf; - #ifdef CONFIG_MULTICORE_RAID456 - init_waitqueue_head(&sh->ops.wait_for_ops); - #endif spin_lock_init(&sh->stripe_lock); @@ -1627,9 +1594,6 @@ static int resize_stripes(struct r5conf *conf, int newsize) 
break; nsh->raid_conf = conf; - #ifdef CONFIG_MULTICORE_RAID456 - init_waitqueue_head(&nsh->ops.wait_for_ops); - #endif spin_lock_init(&nsh->stripe_lock); list_add(&nsh->lru, &newstripes); |
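The redundancy improvement described in the commit message comes from confining the "far"/"offset" rotation to sets of (near_copies * far_copies) devices, with the trailing set absorbing any remainder when the disk count is not a multiple of the set size. Not part of the patch, but here is a minimal userspace sketch of that placement rule, following the __raid10_find_phys() hunk above (the geometry used in main() is illustrative only):

#include <stdio.h>

/*
 * Given the device 'd' holding the primary copy of a chunk, return the
 * device holding its far copy when use_far_sets is enabled.  Mirrors the
 * arithmetic added to __raid10_find_phys() above.
 */
static int far_copy_dev(int d, int raid_disks, int near_copies, int far_set_size)
{
	int last_set_start = (raid_disks / far_set_size - 1) * far_set_size;
	int last_set_size  = far_set_size + raid_disks % far_set_size;
	int set = d / far_set_size;

	d += near_copies;
	if ((raid_disks % far_set_size) && d > last_set_start) {
		/* trailing, larger set: rotate within it */
		d -= last_set_start;
		d %= last_set_size;
		d += last_set_start;
	} else {
		/* rotate within this device's own set */
		d %= far_set_size;
		d += far_set_size * set;
	}
	return d;
}

int main(void)
{
	/* 5 devices, near_copies=1, far_copies=2 -> far_set_size = 5 / 2 = 2 */
	int raid_disks = 5, near_copies = 1, far_set_size = 2;
	int d;

	for (d = 0; d < raid_disks; d++)
		printf("primary on dev %d -> far copy on dev %d\n",
		       d, far_copy_dev(d, raid_disks, near_copies, far_set_size));
	return 0;
}

With these numbers the far copies land on devices 1 0 3 4 2, matching the [A B] [C D E] / [B A] [E C D] example in the new raid10.c comment; because each set rotates independently, two failed devices can only cause data loss when they fall within the same set, which is the redundancy improvement the pull request refers to.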