From 5bf696dad4beecb6174e701c97e1f2574e6a2c96 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Wed, 28 Sep 2011 11:39:59 +0300 Subject: exofs: Rename struct ore_components comps => oc ore_components already has a comps member so this leads to things like comps->comps which is annoying. The name oc was already used in new code. So rename all old usage of ore_components comps => ore_components oc. Signed-off-by: Boaz Harrosh --- include/scsi/osd_ore.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/scsi/osd_ore.h b/include/scsi/osd_ore.h index c5c5e008e6de..954292a23767 100644 --- a/include/scsi/osd_ore.h +++ b/include/scsi/osd_ore.h @@ -64,7 +64,7 @@ struct ore_io_state { ore_io_done_fn done; struct ore_layout *layout; - struct ore_components *comps; + struct ore_components *oc; /* Global read/write IO*/ loff_t offset; -- cgit v1.2.3 From 8d2d83a8352b0f9c1da82c36f741722f2960feea Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Wed, 10 Aug 2011 14:15:02 -0700 Subject: exofs: Remove unused data_map member from exofs_sb_info The struct pnfs_osd_data_map data_map member of exofs_sb_info was never used after mount. In fact all its members were duplicated by the ore_layout structure. So just remove the duplicated information. Also removed some stupid, but perfectly supported, restrictions on layout parameters. The case where num_devices is not divisible by mirror_count+1 is perfectly fine since the rotating device view will eventually use all the devices it can get.
Signed-off-by: Boaz Harrosh Signed-off-by: Benny Halevy --- include/scsi/osd_ore.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/scsi/osd_ore.h b/include/scsi/osd_ore.h index 954292a23767..f7fabb478877 100644 --- a/include/scsi/osd_ore.h +++ b/include/scsi/osd_ore.h @@ -34,6 +34,8 @@ struct ore_comp { struct ore_layout { /* Our way of looking at the data_map */ + enum pnfs_osd_raid_algorithm4 + raid_algorithm; unsigned stripe_unit; unsigned mirrors_p1; -- cgit v1.2.3 From eb507bc18969f63b8968034144fd69706c492516 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Wed, 10 Aug 2011 14:17:28 -0700 Subject: ore: Make ore_striping_info and ore_calc_stripe_info public The struct ore_striping_info will be used later in other structures. And ore_calc_stripe_info as well. Rename them and make struct ore_striping_info public. ore_calc_stripe_info is still static, will be made public on first use. Signed-off-by: Boaz Harrosh --- include/scsi/osd_ore.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/scsi/osd_ore.h b/include/scsi/osd_ore.h index f7fabb478877..e4d550faa7c9 100644 --- a/include/scsi/osd_ore.h +++ b/include/scsi/osd_ore.h @@ -56,6 +56,14 @@ struct ore_components { struct osd_dev **ods; /* osd_dev array */ }; +struct ore_striping_info { + u64 obj_offset; + u64 group_length; + u64 M; /* for truncate */ + unsigned dev; + unsigned unit_off; +}; + struct ore_io_state; typedef void (*ore_io_done_fn)(struct ore_io_state *ios, void *private); -- cgit v1.2.3 From d866d875f68fdeae63df334d291fe138dc636d96 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Wed, 28 Sep 2011 14:43:09 +0300 Subject: ore/exofs: Change the type of the devices array (API change) In the pNFS obj-LD the device table at the layout level needs to point to a device_cache node, where it is possible and likely that many layouts will point to the same device-nodes.
In Exofs we have a more orderly structure where we have a single array of devices that repeats twice for a round-robin view of the device table. This patch moves to a model that can be used by the pNFS obj-LD where struct ore_components holds an array of ore_dev-pointers. (ore_dev is newly defined and contains a struct osd_dev *od member) Each pointer in the array of pointers will point to a bigger user-defined dev_struct. That can be accessed by use of the container_of macro. In Exofs an __alloc_dev_table() function allocates the ore_dev-pointers array as well as an exofs_dev array, in one allocation and does the addresses dance to set everything pointing correctly. It still keeps the double allocation trick for the inodes round-robin view of the table. The device table is always allocated dynamically, also for the single device case. So it is unconditionally freed at umount. Signed-off-by: Boaz Harrosh --- include/scsi/osd_ore.h | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/scsi/osd_ore.h b/include/scsi/osd_ore.h index e4d550faa7c9..8fefdfbb1ced 100644 --- a/include/scsi/osd_ore.h +++ b/include/scsi/osd_ore.h @@ -44,6 +44,10 @@ struct ore_layout { unsigned group_count; }; +struct ore_dev { + struct osd_dev *od; +}; + struct ore_components { unsigned numdevs; /* Num of devices in array */ /* If @single_comp == EC_SINGLE_COMP, @comps points to a single @@ -53,9 +57,29 @@ struct ore_components { EC_SINGLE_COMP = 0, EC_MULTPLE_COMPS = 0xffffffff } single_comp; struct ore_comp *comps; - struct osd_dev **ods; /* osd_dev array */ + + /* Array of pointers to ore_dev-* . User will usually have these pointed + * too a bigger struct which contain an "ore_dev ored" member and use + * container_of(oc->ods[i], struct foo_dev, ored) to access the bigger + * structure.
+ */ + struct ore_dev **ods; }; +/* ore_comp_dev Recievies a logical device index */ +static inline struct osd_dev *ore_comp_dev( + const struct ore_components *oc, unsigned i) +{ + BUG_ON(oc->numdevs <= i); + return oc->ods[i]->od; +} + +static inline void ore_comp_set_dev( + struct ore_components *oc, unsigned i, struct osd_dev *od) +{ + oc->ods[i]->od = od; +} + struct ore_striping_info { u64 obj_offset; u64 group_length; -- cgit v1.2.3 From 98260754046eee4cc7d75751a4a20182ade39f58 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Sun, 2 Oct 2011 15:32:50 +0200 Subject: ore: cleanup: Embed an ore_striping_info inside ore_io_state Now that each ore_io_state covers only a single raid group, a single striping_info math is needed. Embed one inside ore_io_state to cache the calculation results and eliminate an extra call. Also the outer _prepare_for_striping is removed since it does nothing. Signed-off-by: Boaz Harrosh --- include/scsi/osd_ore.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/scsi/osd_ore.h b/include/scsi/osd_ore.h index 8fefdfbb1ced..baeef0200a1f 100644 --- a/include/scsi/osd_ore.h +++ b/include/scsi/osd_ore.h @@ -93,6 +93,7 @@ typedef void (*ore_io_done_fn)(struct ore_io_state *ios, void *private); struct ore_io_state { struct kref kref; + struct ore_striping_info si; void *private; ore_io_done_fn done; -- cgit v1.2.3 From 3bd9856857339d7ee8c4ad50030583f1b9415c39 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Wed, 28 Sep 2011 12:04:23 +0300 Subject: ore: Support for partial component table Users like the objlayout-driver would like to only pass a partial device table that covers the IO in question. For example exofs divides the file into raid-group-sized chunks and only serves group_width number of devices at a time.
The partiality is communicated by setting ore_components->first_dev and the array covers all logical devices from oc->first_dev up to (oc->first_dev + oc->numdevs). The ore_comp_dev() API receives a logical device index and returns the actual present device in the table. An out-of-range dev_index will BUG. Logical device index is the theoretical device index as if all the devices of a file are present. i.e.: total_devs = group_width * mirror_p1 * group_count 0 <= dev_index < total_devs Signed-off-by: Boaz Harrosh --- include/scsi/osd_ore.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/scsi/osd_ore.h b/include/scsi/osd_ore.h index baeef0200a1f..492b70d43bb6 100644 --- a/include/scsi/osd_ore.h +++ b/include/scsi/osd_ore.h @@ -49,6 +49,7 @@ struct ore_dev { }; struct ore_components { + unsigned first_dev; /* First logical device no */ unsigned numdevs; /* Num of devices in array */ /* If @single_comp == EC_SINGLE_COMP, @comps points to a single * component. else there are @numdevs components @@ -70,14 +71,14 @@ struct ore_components { static inline struct osd_dev *ore_comp_dev( const struct ore_components *oc, unsigned i) { - BUG_ON(oc->numdevs <= i); - return oc->ods[i]->od; + BUG_ON((i < oc->first_dev) || (oc->first_dev + oc->numdevs <= i)); + return oc->ods[i - oc->first_dev]->od; } static inline void ore_comp_set_dev( struct ore_components *oc, unsigned i, struct osd_dev *od) { - oc->ods[i]->od = od; + oc->ods[i - oc->first_dev]->od = od; } struct ore_striping_info { -- cgit v1.2.3 From 5a51c0c7e9a913649aa65d8233470682bcbb7694 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Wed, 28 Sep 2011 13:18:45 +0300 Subject: ore/exofs: Define new ore_verify_layout All users of the ore will need to check if current code supports the given layout. For example RAID5/6 is not currently supported. So move all the checks from exofs/super.c to a new ore_verify_layout() to be used by ore users.
Note that any new layout should be passed through the ore_verify_layout() because the ore engine will prepare and verify some internal members of ore_layout, and assumes it's called. Signed-off-by: Boaz Harrosh --- include/scsi/osd_ore.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/scsi/osd_ore.h b/include/scsi/osd_ore.h index 492b70d43bb6..716dbeae8cd2 100644 --- a/include/scsi/osd_ore.h +++ b/include/scsi/osd_ore.h @@ -42,6 +42,13 @@ struct ore_layout { unsigned group_width; u64 group_depth; unsigned group_count; + + /* Cached often needed calculations filled in by + * ore_verify_layout + */ + unsigned long max_io_length; /* Max length that should be passed to + * ore_get_rw_state + */ }; struct ore_dev { @@ -138,6 +145,7 @@ static inline unsigned ore_io_state_size(unsigned numdevs) } /* ore.c */ +int ore_verify_layout(unsigned total_comps, struct ore_layout *layout); int ore_get_rw_state(struct ore_layout *layout, struct ore_components *comps, bool is_reading, u64 offset, u64 length, struct ore_io_state **ios); -- cgit v1.2.3 From 4b46c9f5cf69505f0bc708995b88b0cc60317ffd Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Wed, 28 Sep 2011 13:25:50 +0300 Subject: ore/exofs: Change ore_check_io API Current ore_check_io API receives a residual pointer, to report partial IO. But it is actually not used, because in a multiple devices IO there is never a linearity in the IO failure. On the other hand, if every failing device is reported through a received callback, measures can be taken to handle only failed devices, one at a time. This will also be needed by the objects-layout-driver for its error reporting facility. Exofs is not currently using the new information and keeps the old behaviour of failing the complete IO in case of an error. (No partial completion) TODO: Use an ore_check_io callback to set_page_error only the failing pages. And re-dirty write pages.
Signed-off-by: Boaz Harrosh --- include/scsi/osd_ore.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/scsi/osd_ore.h b/include/scsi/osd_ore.h index 716dbeae8cd2..af2231a0fd09 100644 --- a/include/scsi/osd_ore.h +++ b/include/scsi/osd_ore.h @@ -153,7 +153,10 @@ int ore_get_io_state(struct ore_layout *layout, struct ore_components *comps, struct ore_io_state **ios); void ore_put_io_state(struct ore_io_state *ios); -int ore_check_io(struct ore_io_state *ios, u64 *resid); +typedef void (*ore_on_dev_error)(struct ore_io_state *ios, struct ore_dev *od, + unsigned dev_index, enum osd_err_priority oep, + u64 dev_offset, u64 dev_len); +int ore_check_io(struct ore_io_state *ios, ore_on_dev_error rep); int ore_create(struct ore_io_state *ios); int ore_remove(struct ore_io_state *ios); -- cgit v1.2.3 From 611d7a5dc6f2a1a0cfd8cc07b9d15f794cbe5f98 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Tue, 4 Oct 2011 14:20:17 +0200 Subject: ore: Make ore_calc_stripe_info EXPORT_SYMBOL ore_calc_stripe_info is needed by exofs::export.c for the layout calculations. 
Make it exportable Signed-off-by: Boaz Harrosh --- include/scsi/osd_ore.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/scsi/osd_ore.h b/include/scsi/osd_ore.h index af2231a0fd09..a8e39d14f82b 100644 --- a/include/scsi/osd_ore.h +++ b/include/scsi/osd_ore.h @@ -146,6 +146,9 @@ static inline unsigned ore_io_state_size(unsigned numdevs) /* ore.c */ int ore_verify_layout(unsigned total_comps, struct ore_layout *layout); +void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, + struct ore_striping_info *si); + int ore_get_rw_state(struct ore_layout *layout, struct ore_components *comps, bool is_reading, u64 offset, u64 length, struct ore_io_state **ios); -- cgit v1.2.3 From a1fec1dbbc8db974d2582e4040590cebe72171e4 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Wed, 12 Oct 2011 18:42:22 +0200 Subject: ore: RAID5 read This patch introduces the first stage of RAID5 support mainly the skip-over-raid-units when reading. For writes it inserts BLANK units, into where XOR blocks should be calculated and written to. It introduces the new "general raid maths", and the main additional parameters and components needed for raid5. Since at this stage it could corrupt future versions that actually do support raid5, the enablement of raid5 mounting and setting of parity-count > 0 is disabled. So the raid5 code will never be used. Mounting of raid5 is only enabled later once the basic XOR write is also in. But if the patch "enable RAID5" is applied this code has been tested to be able to properly read raid5 volumes and is according to standard. Also it has been tested that the new maths still properly supports RAID0 and grouping code just as before. (BTW: I have found more bugs in the pnfs-obj RAID math fixed here) The ore.c file is getting too big, so new ore_raid.[hc] files are added that will include the special raid stuff that are not used in striping and mirrors. In future write support these will get bigger.
When adding the ore_raid.c to Kbuild file I was forced to rename ore.ko to libore.ko. Is it possible to keep source file, say ore.c and module file ore.ko the same even if there are multiple files inside ore.ko? Signed-off-by: Boaz Harrosh --- include/scsi/osd_ore.h | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/scsi/osd_ore.h b/include/scsi/osd_ore.h index a8e39d14f82b..43821c18cd3f 100644 --- a/include/scsi/osd_ore.h +++ b/include/scsi/osd_ore.h @@ -40,6 +40,7 @@ struct ore_layout { unsigned mirrors_p1; unsigned group_width; + unsigned parity; u64 group_depth; unsigned group_count; @@ -89,11 +90,16 @@ static inline void ore_comp_set_dev( } struct ore_striping_info { + u64 offset; u64 obj_offset; - u64 group_length; + u64 length; + u64 first_stripe_start; /* only used in raid writes */ u64 M; /* for truncate */ + unsigned bytes_in_stripe; unsigned dev; + unsigned par_dev; unsigned unit_off; + unsigned cur_comp; }; struct ore_io_state; @@ -127,6 +133,13 @@ struct ore_io_state { bool reading; + /* House keeping of Parity pages */ + bool extra_part_alloc; + struct page **parity_pages; + unsigned max_par_pages; + unsigned cur_par_page; + unsigned sgs_per_dev; + /* Variable array of size numdevs */ unsigned numdevs; struct ore_per_dev_state { @@ -134,7 +147,10 @@ struct ore_io_state { struct bio *bio; loff_t offset; unsigned length; + unsigned last_sgs_total; unsigned dev; + struct osd_sg_entry *sglist; + unsigned cur_sg; } per_dev[]; }; @@ -147,8 +163,7 @@ static inline unsigned ore_io_state_size(unsigned numdevs) /* ore.c */ int ore_verify_layout(unsigned total_comps, struct ore_layout *layout); void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, - struct ore_striping_info *si); - + u64 length, struct ore_striping_info *si); int ore_get_rw_state(struct ore_layout *layout, struct ore_components *comps, bool is_reading, u64 offset, u64 length, struct ore_io_state **ios); -- cgit 
v1.2.3 From 769ba8d92025fa390f3097e658b8ed6e032d68e9 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Fri, 14 Oct 2011 15:33:51 +0200 Subject: ore: RAID5 Write This is finally the RAID5 Write support. The bigger part of this patch is not the XOR engine itself, But the read4write logic, which is a complete mini prepare_for_striping reading engine that can read scattered pages of a stripe into cache so it can be used for XOR calculation. That is, if the write was not stripe aligned. The main algorithm behind the XOR engine is the 2 dimensional array: struct __stripe_pages_2d. A drawing might save 1000 words --- __stripe_pages_2d | n = pages_in_stripe_unit; w = group_width - parity; | pages array presented to the XOR lib | | V | __1_page_stripe[0].pages --> [c0][c1]..[cw][c_par] <---| | | __1_page_stripe[1].pages --> [c0][c1]..[cw][c_par] <--- | ... | ... | __1_page_stripe[n].pages --> [c0][c1]..[cw][c_par] ^ | data added columns first then row --- The pages are put on this array columns first. i.e.: p0-of-c0, p1-of-c0, ... pn-of-c0, p0-of-c1, ... So we are doing a corner turn of the pages. Note that pages will zigzag down and left, but are put sequentially in growing order. So when the time comes to XOR the stripe, only the beginning and end of the array need be checked. We scan the array and any NULL spot will be filled by pages-to-be-read. The FS that wants to support RAID5 needs to supply an operations-vector that searches a given page in cache, and specifies if the page is uptodate or needs reading. All these pages to be read are put on a slave ore_io_state and synchronously read. All the pages of a stripe are read in one IO, using the scatter gather mechanism. In write we constrain our IO to only be incomplete on a single stripe. Meaning either the complete IO is within a single stripe so we might have pages to read from both beginning or end of the stripe. Or we have some reading to do at beginning but end at stripe boundary.
The left over pages are pushed to the next IO by the API already established by previous work, where an IO offset/length combination presented to the ORE might get the length truncated and the user must re-submit the leftover pages. (Both exofs and NFS support this) But any ORE user should make its best effort to align its IO beforehand and avoid complications. A cached ore_layout->stripe_size member can be used for that calculation. (NOTE: that ORE demands that stripe_size may not be bigger than 32bit) What else? Well read it and tell me. Signed-off-by: Boaz Harrosh --- include/scsi/osd_ore.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/scsi/osd_ore.h b/include/scsi/osd_ore.h index 43821c18cd3f..f05fa826f89e 100644 --- a/include/scsi/osd_ore.h +++ b/include/scsi/osd_ore.h @@ -99,11 +99,17 @@ struct ore_striping_info { unsigned dev; unsigned par_dev; unsigned unit_off; + unsigned cur_pg; unsigned cur_comp; }; struct ore_io_state; typedef void (*ore_io_done_fn)(struct ore_io_state *ios, void *private); +struct _ore_r4w_op { + /* @Priv given here is passed ios->private */ + struct page * (*get_page)(void *priv, u64 page_index, bool *uptodate); + void (*put_page)(void *priv, struct page *page); +}; struct ore_io_state { struct kref kref; @@ -139,6 +145,9 @@ struct ore_io_state { unsigned max_par_pages; unsigned cur_par_page; unsigned sgs_per_dev; + struct __stripe_pages_2d *sp2d; + struct ore_io_state *ios_read_4_write; + const struct _ore_r4w_op *r4w; /* Variable array of size numdevs */ unsigned numdevs; -- cgit v1.2.3