From b55e6bfcd23cb2f7249095050c649f7aea813f9f Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:18:06 -0800 Subject: [PATCH] md: Split disks array out of raid5 conf structure so it is easier to grow The remainder of this batch implements raid5 reshaping. Currently the only shape change that is supported is added a device, but it is envisioned that changing the chunksize and layout will also be supported, as well as changing the level (e.g. 1->5, 5->6). The reshape process naturally has to move all of the data in the array, and so should be used with caution. It is believed to work, and some testing does support this, but wider testing would be great for increasing my confidence. You will need a version of mdadm newer than 2.3.1 to make use of raid5 growth. This is because mdadm need to take a copy of a 'critical section' at the start of the array incase there is a crash at an awkward moment. On restart, mdadm will restore the critical section and allow reshape to continue. I hope to release a 2.4-pre by early next week - it still needs a little more polishing. This patch: Previously the array of disk information was included in the raid5 'conf' structure which was allocated to an appropriate size. This makes it awkward to change the size of that array. So we split it off into a separate kmalloced array which will require a little extra indexing, but is much easier to grow. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/raid/raid5.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/raid') diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h index 394da8207b34..94dbdd406f12 100644 --- a/include/linux/raid/raid5.h +++ b/include/linux/raid/raid5.h @@ -240,7 +240,7 @@ struct raid5_private_data { * waiting for 25% to be free */ spinlock_t device_lock; - struct disk_info disks[0]; + struct disk_info *disks; }; typedef struct raid5_private_data raid5_conf_t; -- cgit v1.2.3 From ad01c9e3752f4ba4f3d99c89b7370fa4983a25b5 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:18:07 -0800 Subject: [PATCH] md: Allow stripes to be expanded in preparation for expanding an array Before a RAID-5 can be expanded, we need to be able to expand the stripe-cache data structure. This requires allocating new stripes in a new kmem_cache. If this succeeds, we copy cache pages over and release the old stripes and kmem_cache. We then allocate new pages. If that fails, we leave the stripe cache at it's new size. It isn't worth the effort to shrink it back again. Unfortuanately this means we need two kmem_cache names as we, for a short period of time, we have two kmem_caches. So they are raid5/%s and raid5/%s-alt Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/raid/raid5.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux/raid') diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h index 94dbdd406f12..b7b2653af7bb 100644 --- a/include/linux/raid/raid5.h +++ b/include/linux/raid/raid5.h @@ -216,7 +216,11 @@ struct raid5_private_data { struct list_head bitmap_list; /* stripes delaying awaiting bitmap update */ atomic_t preread_active_stripes; /* stripes with scheduled io */ - char cache_name[20]; + /* unfortunately we need two cache names as we temporarily have + * two caches. + */ + int active_name; + char cache_name[2][20]; kmem_cache_t *slab_cache; /* for allocating stripes */ int seq_flush, seq_write; @@ -238,7 +242,8 @@ struct raid5_private_data { wait_queue_head_t wait_for_overlap; int inactive_blocked; /* release of inactive stripes blocked, * waiting for 25% to be free - */ + */ + int pool_size; /* number of disks in stripeheads in pool */ spinlock_t device_lock; struct disk_info *disks; }; -- cgit v1.2.3 From 7ecaa1e6a1ad69862e9980b6c777e11f26c4782d Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:18:08 -0800 Subject: [PATCH] md: Infrastructure to allow normal IO to continue while array is expanding We need to allow that different stripes are of different effective sizes, and use the appropriate size. Also, when a stripe is being expanded, we must block any IO attempts until the stripe is stable again. Key elements in this change are: - each stripe_head gets a 'disk' field which is part of the key, thus there can sometimes be two stripe heads of the same area of the array, but covering different numbers of devices. One of these will be marked STRIPE_EXPANDING and so won't accept new requests. - conf->expand_progress tracks how the expansion is progressing and is used to determine whether the target part of the array has been expanded yet or not. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/raid/raid5.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux/raid') diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h index b7b2653af7bb..6fa274aea2a0 100644 --- a/include/linux/raid/raid5.h +++ b/include/linux/raid/raid5.h @@ -135,6 +135,7 @@ struct stripe_head { atomic_t count; /* nr of active thread/requests */ spinlock_t lock; int bm_seq; /* sequence number for bitmap flushes */ + int disks; /* disks in stripe */ struct r5dev { struct bio req; struct bio_vec vec; @@ -174,6 +175,7 @@ struct stripe_head { #define STRIPE_DELAYED 6 #define STRIPE_DEGRADED 7 #define STRIPE_BIT_DELAY 8 +#define STRIPE_EXPANDING 9 /* * Plugging: @@ -211,6 +213,10 @@ struct raid5_private_data { int raid_disks, working_disks, failed_disks; int max_nr_stripes; + /* used during an expand */ + sector_t expand_progress; /* MaxSector when no expand happening */ + int previous_raid_disks; + struct list_head handle_list; /* stripes needing handling */ struct list_head delayed_list; /* stripes that have plugged requests */ struct list_head bitmap_list; /* stripes delaying awaiting bitmap update */ -- cgit v1.2.3 From ccfcc3c10b2a5cb8fd3c918199a4ff904fc6fb3e Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:18:09 -0800 Subject: [PATCH] md: Core of raid5 resize process This patch provides the core of the resize/expand process. sync_request notices if a 'reshape' is happening and acts accordingly. It allocated new stripe_heads for the next chunk-wide-stripe in the target geometry, marking them STRIPE_EXPANDING. Then it finds which stripe heads in the old geometry can provide data needed by these and marks them STRIPE_EXPAND_SOURCE. This causes stripe_handle to read all blocks on those stripes. Once all blocks on a STRIPE_EXPAND_SOURCE stripe_head are read, any that are needed are copied into the corresponding STRIPE_EXPANDING stripe_head. Once a STRIPE_EXPANDING stripe_head is full, it is marks STRIPE_EXPAND_READY and then is written out and released. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/raid/md_k.h | 4 ++++ include/linux/raid/raid5.h | 4 +++- 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux/raid') diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 617b9506c760..4e26ef2cacca 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -157,6 +157,9 @@ struct mddev_s * DONE: thread is done and is waiting to be reaped * REQUEST: user-space has requested a sync (used with SYNC) * CHECK: user-space request for for check-only, no repair + * RESHAPE: A reshape is happening + * + * If neither SYNC or RESHAPE are set, then it is a recovery. */ #define MD_RECOVERY_RUNNING 0 #define MD_RECOVERY_SYNC 1 @@ -166,6 +169,7 @@ struct mddev_s #define MD_RECOVERY_NEEDED 5 #define MD_RECOVERY_REQUESTED 6 #define MD_RECOVERY_CHECK 7 +#define MD_RECOVERY_RESHAPE 8 unsigned long recovery; int in_sync; /* know to not need resync */ diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h index 6fa274aea2a0..55c738d50508 100644 --- a/include/linux/raid/raid5.h +++ b/include/linux/raid/raid5.h @@ -157,6 +157,7 @@ struct stripe_head { #define R5_ReadError 8 /* seen a read error here recently */ #define R5_ReWrite 9 /* have tried to over-write the readerror */ +#define R5_Expanded 10 /* This block now has post-expand data */ /* * Write method */ @@ -176,7 +177,8 @@ struct stripe_head { #define STRIPE_DEGRADED 7 #define STRIPE_BIT_DELAY 8 #define STRIPE_EXPANDING 9 - +#define STRIPE_EXPAND_SOURCE 10 +#define STRIPE_EXPAND_READY 11 /* * Plugging: * -- cgit v1.2.3 From 292695531ae4019bb15deedc121b218d1908b648 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:18:10 -0800 Subject: [PATCH] md: Final stages of raid5 expand code This patch adds raid5_reshape and end_reshape which will start and finish the reshape processes. raid5_reshape is only enabled in CONFIG_MD_RAID5_RESHAPE is set, to discourage accidental use. Read the 'help' for the CONFIG_MD_RAID5_RESHAPE entry. and Make sure that you have backups, just in case. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/raid/md.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux/raid') diff --git a/include/linux/raid/md.h b/include/linux/raid/md.h index b6e0bcad84e1..9c77cde5a795 100644 --- a/include/linux/raid/md.h +++ b/include/linux/raid/md.h @@ -92,7 +92,8 @@ extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, extern void md_super_wait(mddev_t *mddev); extern int sync_page_io(struct block_device *bdev, sector_t sector, int size, struct page *page, int rw); - +extern void md_do_sync(mddev_t *mddev); +extern void md_new_event(mddev_t *mddev); #define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); } -- cgit v1.2.3 From f67055780caac6a99f43834795c43acf99eba6a6 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:18:11 -0800 Subject: [PATCH] md: Checkpoint and allow restart of raid5 reshape We allow the superblock to record an 'old' and a 'new' geometry, and a position where any conversion is up to. The geometry allows for changing chunksize, layout and level as well as number of devices. When using verion-0.90 superblock, we convert the version to 0.91 while the conversion is happening so that an old kernel will refuse the assemble the array. For version-1, we use a feature bit for the same effect. When starting an array we check for an incomplete reshape and restart the reshape process if needed. If the reshape stopped at an awkward time (like when updating the first stripe) we refuse to assemble the array, and let user-space worry about it. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/raid/md.h | 2 ++ include/linux/raid/md_k.h | 8 ++++++++ include/linux/raid/md_p.h | 32 +++++++++++++++++++++++++++++--- include/linux/raid/raid5.h | 1 + 4 files changed, 40 insertions(+), 3 deletions(-) (limited to 'include/linux/raid') diff --git a/include/linux/raid/md.h b/include/linux/raid/md.h index 9c77cde5a795..66b44e5e0d6e 100644 --- a/include/linux/raid/md.h +++ b/include/linux/raid/md.h @@ -95,6 +95,8 @@ extern int sync_page_io(struct block_device *bdev, sector_t sector, int size, extern void md_do_sync(mddev_t *mddev); extern void md_new_event(mddev_t *mddev); +extern void md_update_sb(mddev_t * mddev); + #define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); } #endif diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 4e26ef2cacca..1a6f9f2f6282 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -132,6 +132,14 @@ struct mddev_s char uuid[16]; + /* If the array is being reshaped, we need to record the + * new shape and an indication of where we are up to. + * This is written to the superblock. + * If reshape_position is MaxSector, then no reshape is happening (yet). + */ + sector_t reshape_position; + int delta_disks, new_level, new_layout, new_chunk; + struct mdk_thread_s *thread; /* management thread */ struct mdk_thread_s *sync_thread; /* doing resync or reconstruct */ sector_t curr_resync; /* blocks scheduled */ diff --git a/include/linux/raid/md_p.h b/include/linux/raid/md_p.h index c100fa5d4bfa..774e1acfb8c4 100644 --- a/include/linux/raid/md_p.h +++ b/include/linux/raid/md_p.h @@ -102,6 +102,18 @@ typedef struct mdp_device_descriptor_s { #define MD_SB_ERRORS 1 #define MD_SB_BITMAP_PRESENT 8 /* bitmap may be present nearby */ + +/* + * Notes: + * - if an array is being reshaped (restriped) in order to change the + * the number of active devices in the array, 'raid_disks' will be + * the larger of the old and new numbers. 'delta_disks' will + * be the "new - old". So if +ve, raid_disks is the new value, and + * "raid_disks-delta_disks" is the old. If -ve, raid_disks is the + * old value and "raid_disks+delta_disks" is the new (smaller) value. + */ + + typedef struct mdp_superblock_s { /* * Constant generic information @@ -146,7 +158,13 @@ typedef struct mdp_superblock_s { __u32 cp_events_hi; /* 10 high-order of checkpoint update count */ #endif __u32 recovery_cp; /* 11 recovery checkpoint sector count */ - __u32 gstate_sreserved[MD_SB_GENERIC_STATE_WORDS - 12]; + /* There are only valid for minor_version > 90 */ + __u64 reshape_position; /* 12,13 next address in array-space for reshape */ + __u32 new_level; /* 14 new level we are reshaping to */ + __u32 delta_disks; /* 15 change in number of raid_disks */ + __u32 new_layout; /* 16 new layout */ + __u32 new_chunk; /* 17 new chunk size (bytes) */ + __u32 gstate_sreserved[MD_SB_GENERIC_STATE_WORDS - 18]; /* * Personality information @@ -207,7 +225,14 @@ struct mdp_superblock_1 { * NOTE: signed, so bitmap can be before superblock * only meaningful of feature_map[0] is set. */ - __u8 pad1[128-100]; /* set to 0 when written */ + + /* These are only valid with feature bit '4' */ + __u64 reshape_position; /* next address in array-space for reshape */ + __u32 new_level; /* new level we are reshaping to */ + __u32 delta_disks; /* change in number of raid_disks */ + __u32 new_layout; /* new layout */ + __u32 new_chunk; /* new chunk size (bytes) */ + __u8 pad1[128-124]; /* set to 0 when written */ /* constant this-device information - 64 bytes */ __u64 data_offset; /* sector start of data, often 0 */ @@ -240,8 +265,9 @@ struct mdp_superblock_1 { /* feature_map bits */ #define MD_FEATURE_BITMAP_OFFSET 1 +#define MD_FEATURE_RESHAPE_ACTIVE 4 -#define MD_FEATURE_ALL 1 +#define MD_FEATURE_ALL 5 #endif diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h index 55c738d50508..abcdf0d0658a 100644 --- a/include/linux/raid/raid5.h +++ b/include/linux/raid/raid5.h @@ -224,6 +224,7 @@ struct raid5_private_data { struct list_head bitmap_list; /* stripes delaying awaiting bitmap update */ atomic_t preread_active_stripes; /* stripes with scheduled io */ + atomic_t reshape_stripes; /* stripes with pending writes for reshape */ /* unfortunately we need two cache names as we temporarily have * two caches. */ -- cgit v1.2.3 From b578d55fdd80140f657130abd85aebeb345755fb Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:18:12 -0800 Subject: [PATCH] md: Only checkpoint expansion progress occasionally Instead of checkpointing at each stripe, only checkpoint when a new write would overwrite uncheckpointed data. Block any write to the uncheckpointed area. Arbitrarily checkpoint at least every 3Meg. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/raid/raid5.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux/raid') diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h index abcdf0d0658a..914af667044f 100644 --- a/include/linux/raid/raid5.h +++ b/include/linux/raid/raid5.h @@ -217,6 +217,9 @@ struct raid5_private_data { /* used during an expand */ sector_t expand_progress; /* MaxSector when no expand happening */ + sector_t expand_lo; /* from here up to expand_progress it out-of-bounds + * as we haven't flushed the metadata yet + */ int previous_raid_disks; struct list_head handle_list; /* stripes needing handling */ -- cgit v1.2.3 From 63c70c4f3a30e77e6f445bd16eff7934a031ebd3 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:18:13 -0800 Subject: [PATCH] md: Split reshape handler in check_reshape and start_reshape check_reshape checks validity and does things that can be done instantly - like adding devices to raid1. start_reshape initiates a restriping process to convert the whole array. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/raid/md_k.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux/raid') diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 1a6f9f2f6282..002ee631fabb 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -261,7 +261,8 @@ struct mdk_personality int (*spare_active) (mddev_t *mddev); sector_t (*sync_request)(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster); int (*resize) (mddev_t *mddev, sector_t sectors); - int (*reshape) (mddev_t *mddev, int raid_disks); + int (*check_reshape) (mddev_t *mddev); + int (*start_reshape) (mddev_t *mddev); int (*reconfig) (mddev_t *mddev, int layout, int chunk_size); /* quiesce moves between quiescence states * 0 - fully active -- cgit v1.2.3 From e464eafdb4400c6d6576ba3840d8bd40340f8a96 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:18:14 -0800 Subject: [PATCH] md: Support suspending of IO to regions of an md array This allows user-space to access data safely. This is needed for raid5 reshape as user-space needs to take a backup of the first few stripes before allowing reshape to commence. It will also be useful in cluster-aware raid1 configurations so that all cluster members can leave a section of the array untouched while a resync/recovery happens. A 'start' and 'end' of the suspended range are written to 2 sysfs attributes. Note that only one range can be suspended at a time. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/raid/md_k.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux/raid') diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 002ee631fabb..c0d3097846a7 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -151,6 +151,10 @@ struct mddev_s sector_t resync_mismatches; /* count of sectors where * parity/replica mismatch found */ + + /* allow user-space to request suspension of IO to regions of the array */ + sector_t suspend_lo; + sector_t suspend_hi; /* if zero, use the system-wide default */ int sync_speed_min; int sync_speed_max; -- cgit v1.2.3 From df5b89b323b922f56650b4b4d7c41899b937cf19 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:18:20 -0800 Subject: [PATCH] md: Convert reconfig_sem to reconfig_mutex ... being careful that mutex_trylock is inverted wrt down_trylock Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/raid/md_k.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/raid') diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index c0d3097846a7..e2df61f5b09a 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -185,7 +185,7 @@ struct mddev_s unsigned long recovery; int in_sync; /* know to not need resync */ - struct semaphore reconfig_sem; + struct mutex reconfig_mutex; atomic_t active; int changed; /* true if we might need to reread partition info */ -- cgit v1.2.3