From 07c09b725543ff2958c11522d583f90f7fdba735 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 15 Feb 2013 22:10:17 -0600 Subject: libceph: make ceph_msg->bio_seg be unsigned The bio_seg field is used by the ceph messenger in iterating through a bio. It should never have a negative value, so make it an unsigned. (I contemplated making it unsigned short to match the struct bio definition, but it offered no benefit.) Change variables used to hold bio_seg values to all be unsigned as well. Change two variable names in init_bio_iter() to match the convention used everywhere else. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/messenger.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 60903e0f665c..8297288a66e0 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -86,7 +86,7 @@ struct ceph_msg { #ifdef CONFIG_BLOCK struct bio *bio; /* instead of pages/pagelist */ struct bio *bio_iter; /* bio iterator */ - int bio_seg; /* current bio segment */ + unsigned int bio_seg; /* current bio segment */ #endif /* CONFIG_BLOCK */ struct ceph_pagelist *trail; /* the trailing part of the data */ bool front_is_vmalloc; -- cgit v1.2.3 From d4b515fa10dd52a2aef88df7299e9f3a8ab0957a Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 25 Feb 2013 17:35:46 -0600 Subject: libceph: distinguish page array and pagelist count Use distinct fields for tracking the number of pages in a message's page array and in a message's page list. Currently only one or the other is used at a time, but that will be changing soon. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/messenger.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 8297288a66e0..1b08349a413c 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -75,9 +75,10 @@ struct ceph_msg { struct kvec front; /* unaligned blobs of message */ struct ceph_buffer *middle; struct page **pages; /* data payload. NOT OWNER. */ - unsigned nr_pages; /* size of page array */ + unsigned page_count; /* size of page array */ unsigned page_alignment; /* io offset in first page */ struct ceph_pagelist *pagelist; /* instead of pages */ + unsigned int pagelist_count; /* number of pages in pagelist */ struct ceph_connection *con; struct list_head list_head; -- cgit v1.2.3 From 0d5af1643535508f82d6bcc2b9b93b180e8c3f4b Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 27 Feb 2013 10:26:25 -0600 Subject: libceph: complete lingering requests only once An osd request marked to linger will be re-submitted in the event a connection to the target osd gets dropped. Currently, if there is a callback function associated with a request it will be called each time a request is submitted--which for lingering requests can be more than once. Change it so a request--including lingering ones--will get completed (from the perspective of the user of the osd client) exactly once. This resolves: http://tracker.ceph.com/issues/3967 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/osd_client.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 1dd5d466b6f9..a79f833bba4a 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -85,6 +85,7 @@ struct ceph_osd_request { s32 r_reply_op_result[CEPH_OSD_MAX_OP]; int r_got_reply; int r_linger; + int r_completed; struct ceph_osd_client *r_osdc; struct kref r_kref; -- cgit v1.2.3 From 2a24d1f4bd7995de133c857bfdc77ac82c842300 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 1 Mar 2013 18:00:15 -0600 Subject: libceph: use (void *) for untyped data in osd ops Two of the fields defining osd operations are defined using (char *) while the data they represent are really untyped, not character strings. Change them to have type (void *). Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/osd_client.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index a79f833bba4a..ec33588194ef 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -184,7 +184,7 @@ struct ceph_osd_req_op { } extent; struct { const char *name; - const char *val; + const void *val; u32 name_len; u32 value_len; __u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */ @@ -193,7 +193,7 @@ struct ceph_osd_req_op { struct { const char *class_name; const char *method_name; - const char *indata; + const void *indata; u32 indata_len; __u8 class_len; __u8 method_len; -- cgit v1.2.3 From ec02a2f2ffae13e038453ae89592a8c6210f7f4d Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 1 Mar 2013 18:00:15 -0600 Subject: libceph: kill ceph_msg->pagelist_count The pagelist_count field is never actually used, so get rid of it. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/messenger.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 1b08349a413c..6c118748a7f8 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -78,7 +78,6 @@ struct ceph_msg { unsigned page_count; /* size of page array */ unsigned page_alignment; /* io offset in first page */ struct ceph_pagelist *pagelist; /* instead of pages */ - unsigned int pagelist_count; /* number of pages in pagelist */ struct ceph_connection *con; struct list_head list_head; -- cgit v1.2.3 From 41766f87f54cc8bef023b4b0550f48753959345a Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 1 Mar 2013 18:00:15 -0600 Subject: libceph: rename ceph_calc_object_layout() The purpose of ceph_calc_object_layout() is to fill in the pool number and seed for a ceph_pg structure provided, based on a given osd map and target object id. Currently that function takes a file layout parameter, but the only thing used out of that is its pool number. Change the function so it takes a pool number rather than the full file layout structure. Only update the ceph_pg if the pool is found in the osd map. Get rid of few useless lines of code from the function while there. Since the function now very clearly just fills in the ceph_pg structure it's provided, rename it ceph_calc_ceph_pg(). Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/osdmap.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index c819190d1642..167daf60c4e8 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -131,10 +131,8 @@ extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, u64 *bno, u64 *oxoff, u64 *oxlen); /* calculate mapping of object to a placement group */ -extern int ceph_calc_object_layout(struct ceph_pg *pg, - const char *oid, - struct ceph_file_layout *fl, - struct ceph_osdmap *osdmap); +extern int ceph_calc_ceph_pg(struct ceph_pg *pg, const char *oid, + struct ceph_osdmap *osdmap, uint64_t pool); extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, int *acting); -- cgit v1.2.3 From 153e5167e0e237faaefb7adf82db5748c1452d73 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 1 Mar 2013 18:00:15 -0600 Subject: libceph: don't assign page info in ceph_osdc_new_request() Currently ceph_osdc_new_request() assigns an osd request's r_num_pages and r_alignment fields. The only thing it does after that is call ceph_osdc_build_request(), and that doesn't need those fields to be assigned. Move the assignment of those fields out of ceph_osdc_new_request() and into its caller. As a result, the page_align parameter is no longer used, so get rid of it. Note that in ceph_sync_write(), the value for req->r_num_pages had already been calculated earlier (as num_pages, and fortunately it was computed the same way). So don't bother recomputing it, but because it's not needed earlier, move that calculation after the call to ceph_osdc_new_request(). Hold off making the assignment to r_alignment, doing it instead r_pages and r_num_pages are getting set. Similarly, in start_read(), nr_pages already holds the number of pages in the array (and is calculated the same way), so there's no need to recompute it. Move the assignment of the page alignment down with the others there as well. This and the next few patches are preparation work for: http://tracker.ceph.com/issues/4127 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/osd_client.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index ec33588194ef..803a9db0b475 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -247,7 +247,7 @@ extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *, int do_sync, u32 truncate_seq, u64 truncate_size, struct timespec *mtime, - bool use_mempool, int page_align); + bool use_mempool); extern void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc, struct ceph_osd_request *req); -- cgit v1.2.3 From 2794a82a11cfeae0890741b18b0049ddb55ce646 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 14 Feb 2013 12:16:43 -0600 Subject: libceph: separate osd request data info Pull the fields in an osd request structure that define the data for the request out into a separate structure. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/osd_client.h | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 803a9db0b475..600b8278d11e 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -50,6 +50,21 @@ struct ceph_osd { #define CEPH_OSD_MAX_OP 10 +struct ceph_osd_data { + struct { + struct { + struct page **pages; + u32 num_pages; + u32 alignment; + bool pages_from_pool; + bool own_pages; + }; +#ifdef CONFIG_BLOCK + struct bio *bio; +#endif /* CONFIG_BLOCK */ + }; +}; + /* an in-flight request */ struct ceph_osd_request { u64 r_tid; /* unique for this client */ @@ -105,15 +120,8 @@ struct ceph_osd_request { struct ceph_file_layout r_file_layout; struct ceph_snap_context *r_snapc; /* snap context for writes */ - unsigned r_num_pages; /* size of page array (follows) */ - unsigned r_page_alignment; /* io offset in first page */ - struct page **r_pages; /* pages for data payload */ - int r_pages_from_pool; - int r_own_pages; /* if true, i own page list */ -#ifdef CONFIG_BLOCK - struct bio *r_bio; /* instead of pages */ -#endif + struct ceph_osd_data r_data; struct ceph_pagelist r_trail; /* trailing part of the data */ }; -- cgit v1.2.3 From 2ac2b7a6d4976bd6b5dc0751aa77d12d48d3ac4c Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 14 Feb 2013 12:16:43 -0600 Subject: libceph: distinguish page and bio requests An osd request uses either pages or a bio list for its data. Use a union to record information about the two, and add a data type tag to select between them. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/osd_client.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 600b8278d11e..56604b33dc3c 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -50,8 +50,17 @@ struct ceph_osd { #define CEPH_OSD_MAX_OP 10 +enum ceph_osd_data_type { + CEPH_OSD_DATA_TYPE_NONE, + CEPH_OSD_DATA_TYPE_PAGES, +#ifdef CONFIG_BLOCK + CEPH_OSD_DATA_TYPE_BIO, +#endif /* CONFIG_BLOCK */ +}; + struct ceph_osd_data { - struct { + enum ceph_osd_data_type type; + union { struct { struct page **pages; u32 num_pages; -- cgit v1.2.3 From 0fff87ec798abdb4a99f01cbb0197266bb68c5dc Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 14 Feb 2013 12:16:43 -0600 Subject: libceph: separate read and write data An osd request defines information about where data to be read should be placed as well as where data to write comes from. Currently these are represented by common fields. Keep information about data for writing separate from data to be read by splitting these into data_in and data_out fields. This is the key patch in this whole series, in that it actually identifies which osd requests generate outgoing data and which generate incoming data. It's less obvious (currently) that an osd CALL op generates both outgoing and incoming data; that's the focus of some upcoming work. This resolves: http://tracker.ceph.com/issues/4127 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/osd_client.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 56604b33dc3c..40e02603723d 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -130,8 +130,9 @@ struct ceph_osd_request { struct ceph_file_layout r_file_layout; struct ceph_snap_context *r_snapc; /* snap context for writes */ - struct ceph_osd_data r_data; - struct ceph_pagelist r_trail; /* trailing part of the data */ + struct ceph_osd_data r_data_in; + struct ceph_osd_data r_data_out; + struct ceph_pagelist r_trail; /* trailing part of data out */ }; struct ceph_osd_event { -- cgit v1.2.3 From 7b11ba37585595034a91df8869414f732466b800 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 8 Mar 2013 18:51:03 -0600 Subject: libceph: define CEPH_MSG_MAX_MIDDLE_LEN This is probably unnecessary but the code read as if it were wrong in read_partial_message(). Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/libceph.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 29818fc3fa49..5493d7b86423 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -66,6 +66,7 @@ struct ceph_options { #define CEPH_OSD_IDLE_TTL_DEFAULT 60 #define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024) +#define CEPH_MSG_MAX_MIDDLE_LEN (16*1024*1024) #define CEPH_MSG_MAX_DATA_LEN (16*1024*1024) #define CEPH_AUTH_NAME_DEFAULT "guest" -- cgit v1.2.3 From e0c594878e3211b09208c779df5f996f0b831d9e Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 7 Mar 2013 15:38:25 -0600 Subject: libceph: record byte count not page count Record the byte count for an osd request rather than the page count. The number of pages can always be derived from the byte count (and alignment/offset) but the reverse is not true. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/osd_client.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 40e02603723d..a8016dfbfdba 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -63,7 +63,7 @@ struct ceph_osd_data { union { struct { struct page **pages; - u32 num_pages; + u64 length; u32 alignment; bool pages_from_pool; bool own_pages; -- cgit v1.2.3 From 02afca6ca00b7972887c5cc77068356f33bdfc18 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 14 Feb 2013 12:16:43 -0600 Subject: libceph: isolate message page field manipulation Define a function ceph_msg_data_set_pages(), which more clearly abstracts the assignment page-related fields for data in a ceph message structure. Use this new function in the osd client and mds client. Ideally, these fields would never be set more than once (with BUG_ON() calls to guarantee that). At the moment though the osd client sets these every time it receives a message, and in the event of a communication problem this can happen more than once. (This will be resolved shortly, but setting up these helpers first makes it all a bit easier to work with.) Rearrange the field order in a ceph_msg structure to group those that are used to define the possible data payloads. This partially resolves: http://tracker.ceph.com/issues/4263 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/messenger.h | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 6c118748a7f8..aa463b9b30af 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -74,21 +74,22 @@ struct ceph_msg { struct ceph_msg_footer footer; /* footer */ struct kvec front; /* unaligned blobs of message */ struct ceph_buffer *middle; - struct page **pages; /* data payload. NOT OWNER. */ - unsigned page_count; /* size of page array */ - unsigned page_alignment; /* io offset in first page */ - struct ceph_pagelist *pagelist; /* instead of pages */ - - struct ceph_connection *con; - struct list_head list_head; - struct kref kref; + struct page **pages; /* data payload. NOT OWNER. */ + unsigned int page_alignment; /* io offset in first page */ + unsigned int page_count; /* # pages in array or list */ + struct ceph_pagelist *pagelist; /* instead of pages */ #ifdef CONFIG_BLOCK + unsigned int bio_seg; /* current bio segment */ struct bio *bio; /* instead of pages/pagelist */ struct bio *bio_iter; /* bio iterator */ - unsigned int bio_seg; /* current bio segment */ #endif /* CONFIG_BLOCK */ struct ceph_pagelist *trail; /* the trailing part of the data */ + + struct ceph_connection *con; + struct list_head list_head; /* links for connection lists */ + + struct kref kref; bool front_is_vmalloc; bool more_to_follow; bool needs_out_seq; @@ -218,6 +219,9 @@ extern void ceph_msg_revoke_incoming(struct ceph_msg *msg); extern void ceph_con_keepalive(struct ceph_connection *con); +extern void ceph_msg_data_set_pages(struct ceph_msg *msg, struct page **pages, + unsigned int page_count, size_t alignment); + extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, bool can_fail); extern void ceph_msg_kfree(struct ceph_msg *m); -- cgit v1.2.3 From f1baeb2b9fc1c2c87ec02f1bf8cb88e108d4fbce Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 7 Mar 2013 15:38:26 -0600 Subject: libceph: set page info with byte length When setting page array information for message data, provide the byte length rather than the page count ceph_msg_data_set_pages(). Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/messenger.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index aa463b9b30af..e6d20e892a88 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -220,7 +220,7 @@ extern void ceph_msg_revoke_incoming(struct ceph_msg *msg); extern void ceph_con_keepalive(struct ceph_connection *con); extern void ceph_msg_data_set_pages(struct ceph_msg *msg, struct page **pages, - unsigned int page_count, size_t alignment); + size_t length, size_t alignment); extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, bool can_fail); -- cgit v1.2.3 From 27fa83852ba275361eaa1a1283cf6704fa8191a6 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 14 Feb 2013 12:16:43 -0600 Subject: libceph: isolate other message data fields Define ceph_msg_data_set_pagelist(), ceph_msg_data_set_bio(), and ceph_msg_data_set_trail() to clearly abstract the assignment of the remaining data-related fields in a ceph message structure. Use the new functions in the osd client and mds client. This partially resolves: http://tracker.ceph.com/issues/4263 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/messenger.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index e6d20e892a88..9d9be4682ac3 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -221,6 +221,11 @@ extern void ceph_con_keepalive(struct ceph_connection *con); extern void ceph_msg_data_set_pages(struct ceph_msg *msg, struct page **pages, size_t length, size_t alignment); +extern void ceph_msg_data_set_pagelist(struct ceph_msg *msg, + struct ceph_pagelist *pagelist); +extern void ceph_msg_data_set_bio(struct ceph_msg *msg, struct bio *bio); +extern void ceph_msg_data_set_trail(struct ceph_msg *msg, + struct ceph_pagelist *trail); extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, bool can_fail); -- cgit v1.2.3 From 4a73ef27ad04f1b8ea23eb55e50b20fcc0530a6f Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 7 Mar 2013 15:38:26 -0600 Subject: libceph: record message data byte length Record the number of bytes of data in a page array rather than the number of pages in the array. It can be assumed that the page array is of sufficient size to hold the number of bytes indicated (and offset by the indicated alignment). Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/messenger.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 9d9be4682ac3..1991a6f9dc90 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -77,7 +77,7 @@ struct ceph_msg { struct page **pages; /* data payload. NOT OWNER. */ unsigned int page_alignment; /* io offset in first page */ - unsigned int page_count; /* # pages in array or list */ + size_t length; /* # data bytes in array or list */ struct ceph_pagelist *pagelist; /* instead of pages */ #ifdef CONFIG_BLOCK unsigned int bio_seg; /* current bio segment */ -- cgit v1.2.3 From 97fb1c7f6637ee61c90b8bc186d464cfd426b063 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 1 Mar 2013 18:00:16 -0600 Subject: libceph: define ceph_msg_has_*() data macros Define and use macros ceph_msg_has_*() to determine whether to operate on the pages, pagelist, bio, and trail fields of a message. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/messenger.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 1991a6f9dc90..889fe4720133 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -64,6 +64,13 @@ struct ceph_messenger { u32 required_features; }; +#define ceph_msg_has_pages(m) ((m)->pages != NULL) +#define ceph_msg_has_pagelist(m) ((m)->pagelist != NULL) +#ifdef CONFIG_BLOCK +#define ceph_msg_has_bio(m) ((m)->bio != NULL) +#endif /* CONFIG_BLOCK */ +#define ceph_msg_has_trail(m) ((m)->trail != NULL) + /* * a single message. it contains a header (src, dest, message type, etc.), * footer (crc values, mainly), a "front" message body, and possibly a -- cgit v1.2.3 From f9e15777afd87585f2222dfd446c2e52deb65eba Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 1 Mar 2013 18:00:16 -0600 Subject: libceph: be explicit about message data representation A ceph message has a data payload portion. The memory for that data (either the source of data to send or the location to place data that is received) is specified in several ways. The ceph_msg structure includes fields for all of those ways, but this mispresents the fact that not all of them are used at a time. Specifically, the data in a message can be in: - an array of pages - a list of pages - a list of Linux bios - a second list of pages (the "trail") (The two page lists are currently only ever used for outgoing data.) Impose more structure on the ceph message, making the grouping of some of these fields explicit. Shorten the name of the "page_alignment" field. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/messenger.h | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 889fe4720133..fb2b18a20c13 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -64,12 +64,12 @@ struct ceph_messenger { u32 required_features; }; -#define ceph_msg_has_pages(m) ((m)->pages != NULL) -#define ceph_msg_has_pagelist(m) ((m)->pagelist != NULL) +#define ceph_msg_has_pages(m) ((m)->p.pages != NULL) +#define ceph_msg_has_pagelist(m) ((m)->l.pagelist != NULL) #ifdef CONFIG_BLOCK -#define ceph_msg_has_bio(m) ((m)->bio != NULL) +#define ceph_msg_has_bio(m) ((m)->b.bio != NULL) #endif /* CONFIG_BLOCK */ -#define ceph_msg_has_trail(m) ((m)->trail != NULL) +#define ceph_msg_has_trail(m) ((m)->t.trail != NULL) /* * a single message. it contains a header (src, dest, message type, etc.), @@ -82,16 +82,25 @@ struct ceph_msg { struct kvec front; /* unaligned blobs of message */ struct ceph_buffer *middle; - struct page **pages; /* data payload. NOT OWNER. */ - unsigned int page_alignment; /* io offset in first page */ - size_t length; /* # data bytes in array or list */ - struct ceph_pagelist *pagelist; /* instead of pages */ + /* data payload */ + struct { + struct page **pages; /* NOT OWNER. */ + size_t length; /* # data bytes in array */ + unsigned int alignment; /* first page */ + } p; + struct { + struct ceph_pagelist *pagelist; + } l; #ifdef CONFIG_BLOCK - unsigned int bio_seg; /* current bio segment */ - struct bio *bio; /* instead of pages/pagelist */ - struct bio *bio_iter; /* bio iterator */ + struct { + struct bio *bio_iter; /* iterator */ + struct bio *bio; + unsigned int bio_seg; /* current seg in bio */ + } b; #endif /* CONFIG_BLOCK */ - struct ceph_pagelist *trail; /* the trailing part of the data */ + struct { + struct ceph_pagelist *trail; /* trailing part of data */ + } t; struct ceph_connection *con; struct list_head list_head; /* links for connection lists */ -- cgit v1.2.3 From 437945094fed0deb1810e8da95465c8f26bc6f80 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 1 Mar 2013 18:00:16 -0600 Subject: libceph: abstract message data Group the types of message data into an abstract structure with a type indicator and a union containing fields appropriate to the type of data it represents. Use this to represent the pages, pagelist, bio, and trail in a ceph message. Verify message data is of type NONE in ceph_msg_data_set_*() routines. Since information about message data of type NONE really should not be interpreted, get rid of the other assertions in those functions. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/messenger.h | 71 ++++++++++++++++++++++++++++++------------ 1 file changed, 51 insertions(+), 20 deletions(-) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index fb2b18a20c13..5860dd0c2caf 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -64,12 +64,55 @@ struct ceph_messenger { u32 required_features; }; -#define ceph_msg_has_pages(m) ((m)->p.pages != NULL) -#define ceph_msg_has_pagelist(m) ((m)->l.pagelist != NULL) +#define ceph_msg_has_pages(m) ((m)->p.type == CEPH_MSG_DATA_PAGES) +#define ceph_msg_has_pagelist(m) ((m)->l.type == CEPH_MSG_DATA_PAGELIST) #ifdef CONFIG_BLOCK -#define ceph_msg_has_bio(m) ((m)->b.bio != NULL) +#define ceph_msg_has_bio(m) ((m)->b.type == CEPH_MSG_DATA_BIO) #endif /* CONFIG_BLOCK */ -#define ceph_msg_has_trail(m) ((m)->t.trail != NULL) +#define ceph_msg_has_trail(m) ((m)->t.type == CEPH_MSG_DATA_PAGELIST) + +enum ceph_msg_data_type { + CEPH_MSG_DATA_NONE, /* message contains no data payload */ + CEPH_MSG_DATA_PAGES, /* data source/destination is a page array */ + CEPH_MSG_DATA_PAGELIST, /* data source/destination is a pagelist */ +#ifdef CONFIG_BLOCK + CEPH_MSG_DATA_BIO, /* data source/destination is a bio list */ +#endif /* CONFIG_BLOCK */ +}; + +static __inline__ bool ceph_msg_data_type_valid(enum ceph_msg_data_type type) +{ + switch (type) { + case CEPH_MSG_DATA_NONE: + case CEPH_MSG_DATA_PAGES: + case CEPH_MSG_DATA_PAGELIST: +#ifdef CONFIG_BLOCK + case CEPH_MSG_DATA_BIO: +#endif /* CONFIG_BLOCK */ + return true; + default: + return false; + } +} + +struct ceph_msg_data { + enum ceph_msg_data_type type; + union { +#ifdef CONFIG_BLOCK + struct { + struct bio *bio_iter; /* iterator */ + struct bio *bio; + unsigned int bio_seg; /* current seg in bio */ + }; +#endif /* CONFIG_BLOCK */ + struct { + struct page **pages; /* NOT OWNER. */ + size_t length; /* total # bytes */ + unsigned int alignment; /* first page */ + }; + struct ceph_pagelist *pagelist; + }; +}; /* * a single message. it contains a header (src, dest, message type, etc.), @@ -83,24 +126,12 @@ struct ceph_msg { struct ceph_buffer *middle; /* data payload */ - struct { - struct page **pages; /* NOT OWNER. */ - size_t length; /* # data bytes in array */ - unsigned int alignment; /* first page */ - } p; - struct { - struct ceph_pagelist *pagelist; - } l; + struct ceph_msg_data p; /* pages */ + struct ceph_msg_data l; /* pagelist */ #ifdef CONFIG_BLOCK - struct { - struct bio *bio_iter; /* iterator */ - struct bio *bio; - unsigned int bio_seg; /* current seg in bio */ - } b; + struct ceph_msg_data b; /* bio */ #endif /* CONFIG_BLOCK */ - struct { - struct ceph_pagelist *trail; /* trailing part of data */ - } t; + struct ceph_msg_data t; /* trail */ struct ceph_connection *con; struct list_head list_head; /* links for connection lists */ -- cgit v1.2.3 From fe38a2b67bc6b3a60da82a23e9082256a30e39d9 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 6 Mar 2013 23:39:39 -0600 Subject: libceph: start defining message data cursor This patch lays out the foundation for using generic routines to manage processing items of message data. For simplicity, we'll start with just the trail portion of a message, because it stands alone and is only present for outgoing data. First some basic concepts. We'll use the term "data item" to represent one of the ceph_msg_data structures associated with a message. There are currently four of those, with single-letter field names p, l, b, and t. A data item is further broken into "pieces" which always lie in a single page. A data item will include a "cursor" that will track state as the memory defined by the item is consumed by sending data from or receiving data into it. We define three routines to manipulate a data item's cursor: the "init" routine; the "next" routine; and the "advance" routine. The "init" routine initializes the cursor so it points at the beginning of the first piece in the item. The "next" routine returns the page, page offset, and length (limited by both the page and item size) of the next unconsumed piece in the item. It also indicates to the caller whether the piece being returned is the last one in the data item. The "advance" routine consumes the requested number of bytes in the item (advancing the cursor). This is used to record the number of bytes from the current piece that were actually sent or received by the network code. It returns an indication of whether the result means the current piece has been fully consumed. This is used by the message send code to determine whether it should calculate the CRC for the next piece processed. The trail of a message is implemented as a ceph pagelist. The routines defined for it will be usable for non-trail pagelist data as well. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/messenger.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 5860dd0c2caf..14862438faff 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -95,6 +95,12 @@ static __inline__ bool ceph_msg_data_type_valid(enum ceph_msg_data_type type) } } +struct ceph_msg_data_cursor { + bool last_piece; /* now at last piece of data item */ + struct page *page; /* current page in pagelist */ + size_t offset; /* pagelist bytes consumed */ +}; + struct ceph_msg_data { enum ceph_msg_data_type type; union { @@ -112,6 +118,7 @@ struct ceph_msg_data { }; struct ceph_pagelist *pagelist; }; + struct ceph_msg_data_cursor cursor; /* pagelist only */ }; /* -- cgit v1.2.3 From dd236fcb65d7b6b80c408cb5f66aab55f4594284 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 6 Mar 2013 23:39:39 -0600 Subject: libceph: prepare for other message data item types This just inserts some infrastructure in preparation for handling other types of ceph message data items. No functional changes, just trying to simplify review by separating out some noise. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/messenger.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 14862438faff..716c3fdeb257 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -97,8 +97,12 @@ static __inline__ bool ceph_msg_data_type_valid(enum ceph_msg_data_type type) struct ceph_msg_data_cursor { bool last_piece; /* now at last piece of data item */ - struct page *page; /* current page in pagelist */ - size_t offset; /* pagelist bytes consumed */ + union { + struct { /* pagelist */ + struct page *page; /* page from list */ + size_t offset; /* bytes from list */ + }; + }; }; struct ceph_msg_data { -- cgit v1.2.3 From 6aaa4511deb4b0fd776d1153dc63a89cdc024fb8 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 6 Mar 2013 23:39:39 -0600 Subject: libceph: implement bio message data item cursor Implement and use cursor routines for bio message data items for outbound message data. (See the previous commit for reasoning in support of the changes in out_msg_pos_next().) Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/messenger.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 716c3fdeb257..76b4645e2dff 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -98,6 +98,13 @@ static __inline__ bool ceph_msg_data_type_valid(enum ceph_msg_data_type type) struct ceph_msg_data_cursor { bool last_piece; /* now at last piece of data item */ union { +#ifdef CONFIG_BLOCK + struct { /* bio */ + struct bio *bio; /* bio from list */ + unsigned int vector_index; /* vector from bio */ + unsigned int vector_offset; /* bytes from vector */ + }; +#endif /* CONFIG_BLOCK */ struct { /* pagelist */ struct page *page; /* page from list */ size_t offset; /* bytes from list */ -- cgit v1.2.3 From e766d7b55e10f93c7bab298135a4e90dcc46620d Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 7 Mar 2013 15:38:28 -0600 Subject: libceph: implement pages array cursor Implement and use cursor routines for page array message data items for outbound message data. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/messenger.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 76b4645e2dff..b53b9ef65009 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -105,6 +105,12 @@ struct ceph_msg_data_cursor { unsigned int vector_offset; /* bytes from vector */ }; #endif /* CONFIG_BLOCK */ + struct { /* pages */ + size_t resid; /* bytes from array */ + unsigned int page_offset; /* offset in page */ + unsigned short page_index; /* index in array */ + unsigned short page_count; /* pages in array */ + }; struct { /* pagelist */ struct page *page; /* page from list */ size_t offset; /* bytes from list */ -- cgit v1.2.3 From 175face2ba31025b0dcd6da4e711fca7764287fa Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 8 Mar 2013 13:35:36 -0600 Subject: libceph: let osd ops determine request data length The length of outgoing data in an osd request is dependent on the osd ops that are embedded in that request. Each op is encoded into a request message using osd_req_encode_op(), so that should be used to determine the amount of outgoing data implied by the op as it is encoded. Have osd_req_encode_op() return the number of bytes of outgoing data implied by the op being encoded, and accumulate and use that in ceph_osdc_build_request(). As a result, ceph_osdc_build_request() no longer requires its "len" parameter, so get rid of it. Using the sum of the op lengths rather than the length provided is a valid change because: - The only callers of osd ceph_osdc_build_request() are rbd and the osd client (in ceph_osdc_new_request() on behalf of the file system). - When rbd calls it, the length provided is only non-zero for write requests, and in that case the single op has the same length value as what was passed here. - When called from ceph_osdc_new_request(), (it's not all that easy to see, but) the length passed is also always the same as the extent length encoded in its (single) write op if present. This resolves: http://tracker.ceph.com/issues/4406 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/osd_client.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index a8016dfbfdba..bcf3f72ec3f8 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -249,8 +249,7 @@ extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client * bool use_mempool, gfp_t gfp_flags); -extern void ceph_osdc_build_request(struct ceph_osd_request *req, - u64 off, u64 len, +extern void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off, unsigned int num_op, struct ceph_osd_req_op *src_ops, struct ceph_snap_context *snapc, -- cgit v1.2.3 From 9a5e6d09ddd0cd68ce64c3aa54095e4a0e85b089 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 8 Mar 2013 13:35:36 -0600 Subject: libceph: have osd requests support pagelist data Add support for recording a ceph pagelist as data associated with an osd request. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/osd_client.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index bcf3f72ec3f8..cf0ba93426da 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -53,6 +53,7 @@ struct ceph_osd { enum ceph_osd_data_type { CEPH_OSD_DATA_TYPE_NONE, CEPH_OSD_DATA_TYPE_PAGES, + CEPH_OSD_DATA_TYPE_PAGELIST, #ifdef CONFIG_BLOCK CEPH_OSD_DATA_TYPE_BIO, #endif /* CONFIG_BLOCK */ @@ -68,8 +69,9 @@ struct ceph_osd_data { bool pages_from_pool; bool own_pages; }; + struct ceph_pagelist *pagelist; #ifdef CONFIG_BLOCK - struct bio *bio; + struct bio *bio; #endif /* CONFIG_BLOCK */ }; }; -- cgit v1.2.3 From 95e072eb38f99c724739d91a1f12bb8bfe1619b5 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 8 Mar 2013 13:35:36 -0600 Subject: libceph: kill osd request r_trail The osd trail is a pagelist, used only for a CALL osd operation to hold the class and method names, along with any input data for the call. It is only currently used by the rbd client, and when it's used it is the only bit of outbound data in the osd request. Since we already support (non-trail) pagelist data in a message, we can just save this outbound CALL data in the "normal" pagelist rather than the trail, and get rid of the trail entirely. The existing pagelist support depends on the pagelist being dynamically allocated, and ownership of it is passed to the messenger once it's been attached to a message. (That is to say, the messenger releases and frees the pagelist when it's done with it). That means we need to dynamically allocate the pagelist also. Note that we simply assert that the allocation of a pagelist structure succeeds. Appending to a pagelist might require a dynamic allocation, so we're already assuming we won't run into trouble doing so (we're just ignore any failures--and that should be fixed at some point). This resolves: http://tracker.ceph.com/issues/4407 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/osd_client.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index cf0ba93426da..1dab291b2dc6 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -134,7 +134,6 @@ struct ceph_osd_request { struct ceph_osd_data r_data_in; struct ceph_osd_data r_data_out; - struct ceph_pagelist r_trail; /* trailing part of data out */ }; struct ceph_osd_event { -- cgit v1.2.3 From 9d2a06c2750177dca5f8d0e89884c1d409d64bbc Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 8 Mar 2013 13:35:36 -0600 Subject: libceph: kill message trail The wart that is the ceph message trail can now be removed, because its only user was the osd client, and the previous patch made that no longer the case. The result allows write_partial_msg_pages() to be simplified considerably. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/messenger.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index b53b9ef65009..0e4536cc46f0 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -69,7 +69,6 @@ struct ceph_messenger { #ifdef CONFIG_BLOCK #define ceph_msg_has_bio(m) ((m)->b.type == CEPH_MSG_DATA_BIO) #endif /* CONFIG_BLOCK */ -#define ceph_msg_has_trail(m) ((m)->t.type == CEPH_MSG_DATA_PAGELIST) enum ceph_msg_data_type { CEPH_MSG_DATA_NONE, /* message contains no data payload */ @@ -155,7 +154,6 @@ struct ceph_msg { #ifdef CONFIG_BLOCK struct ceph_msg_data b; /* bio */ #endif /* CONFIG_BLOCK */ - struct ceph_msg_data t; /* trail */ struct ceph_connection *con; struct list_head list_head; /* links for connection lists */ @@ -295,8 +293,6 @@ extern void ceph_msg_data_set_pages(struct ceph_msg *msg, struct page **pages, extern void ceph_msg_data_set_pagelist(struct ceph_msg *msg, struct ceph_pagelist *pagelist); extern void ceph_msg_data_set_bio(struct ceph_msg *msg, struct bio *bio); -extern void ceph_msg_data_set_trail(struct ceph_msg *msg, - struct ceph_pagelist *trail); extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, bool can_fail); -- cgit v1.2.3 From 3a23083bda56850a1dc0e1c6d270b1f5dc789f07 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 25 Mar 2013 08:47:40 -0700 Subject: libceph: implement RECONNECT_SEQ feature This is an old protocol extension that allows the client and server to avoid resending old messages after a reconnect (following a socket error). Instead, the exchange their sequence numbers during the handshake. This avoids sending a bunch of useless data over the socket. It has been supported in the server code since v0.22 (Sep 2010). Signed-off-by: Sage Weil Reviewed-by: Alex Elder --- include/linux/ceph/ceph_features.h | 2 ++ include/linux/ceph/msgr.h | 1 + 2 files changed, 3 insertions(+) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h index 76554cecaab2..4c42080347af 100644 --- a/include/linux/ceph/ceph_features.h +++ b/include/linux/ceph/ceph_features.h @@ -41,6 +41,7 @@ */ #define CEPH_FEATURES_SUPPORTED_DEFAULT \ (CEPH_FEATURE_NOSRCADDR | \ + CEPH_FEATURE_RECONNECT_SEQ | \ CEPH_FEATURE_PGID64 | \ CEPH_FEATURE_PGPOOL3 | \ CEPH_FEATURE_OSDENC | \ @@ -51,6 +52,7 @@ #define CEPH_FEATURES_REQUIRED_DEFAULT \ (CEPH_FEATURE_NOSRCADDR | \ + CEPH_FEATURE_RECONNECT_SEQ | \ CEPH_FEATURE_PGID64 | \ CEPH_FEATURE_PGPOOL3 | \ CEPH_FEATURE_OSDENC) diff --git a/include/linux/ceph/msgr.h b/include/linux/ceph/msgr.h index 680d3d648cac..3d94a73b5f30 100644 --- a/include/linux/ceph/msgr.h +++ b/include/linux/ceph/msgr.h @@ -87,6 +87,7 @@ struct ceph_entity_inst { #define CEPH_MSGR_TAG_BADPROTOVER 10 /* bad protocol version */ #define CEPH_MSGR_TAG_BADAUTHORIZER 11 /* bad authorizer */ #define CEPH_MSGR_TAG_FEATURES 12 /* insufficient features */ +#define CEPH_MSGR_TAG_SEQ 13 /* 64-bit int follows with seen seq number */ /* -- cgit v1.2.3 From 0bed9b5c523d577378b6f83eab5835fe30c27208 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 25 Mar 2013 10:26:01 -0700 Subject: libceph: add update_authorizer auth method Currently the messenger calls out to a get_authorizer con op, which will create a new authorizer if it doesn't yet have one. In the meantime, when we rotate our service keys, the authorizer doesn't get updated. Eventually it will be rejected by the server on a new connection attempt and get invalidated, and we will then rebuild a new authorizer, but this is not ideal. Instead, if we do have an authorizer, call a new update_authorizer op that will verify that the current authorizer is using the latest secret. If it is not, we will build a new one that does. This avoids the transient failure. This fixes one of the sorry sequence of events for bug http://tracker.ceph.com/issues/4282 Signed-off-by: Sage Weil Reviewed-by: Alex Elder --- include/linux/ceph/auth.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/auth.h b/include/linux/ceph/auth.h index d4080f309b56..73e973e70026 100644 --- a/include/linux/ceph/auth.h +++ b/include/linux/ceph/auth.h @@ -52,6 +52,9 @@ struct ceph_auth_client_ops { */ int (*create_authorizer)(struct ceph_auth_client *ac, int peer_type, struct ceph_auth_handshake *auth); + /* ensure that an existing authorizer is up to date */ + int (*update_authorizer)(struct ceph_auth_client *ac, int peer_type, + struct ceph_auth_handshake *auth); int (*verify_authorizer_reply)(struct ceph_auth_client *ac, struct ceph_authorizer *a, size_t len); void (*destroy_authorizer)(struct ceph_auth_client *ac, -- cgit v1.2.3 From 27859f9773e4a0b2042435b13400ee2c891a61f4 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 25 Mar 2013 10:26:14 -0700 Subject: libceph: wrap auth ops in wrapper functions Use wrapper functions that check whether the auth op exists so that callers do not need a bunch of conditional checks. Simplifies the external interface. Signed-off-by: Sage Weil Reviewed-by: Alex Elder --- include/linux/ceph/auth.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/auth.h b/include/linux/ceph/auth.h index 73e973e70026..c9c3b3abe4a3 100644 --- a/include/linux/ceph/auth.h +++ b/include/linux/ceph/auth.h @@ -97,5 +97,18 @@ extern int ceph_build_auth(struct ceph_auth_client *ac, void *msg_buf, size_t msg_len); extern int ceph_auth_is_authenticated(struct ceph_auth_client *ac); +extern int ceph_auth_create_authorizer(struct ceph_auth_client *ac, + int peer_type, + struct ceph_auth_handshake *auth); +extern void ceph_auth_destroy_authorizer(struct ceph_auth_client *ac, + struct ceph_authorizer *a); +extern int ceph_auth_update_authorizer(struct ceph_auth_client *ac, + int peer_type, + struct ceph_auth_handshake *a); +extern int ceph_auth_verify_authorizer_reply(struct ceph_auth_client *ac, + struct ceph_authorizer *a, + size_t len); +extern void ceph_auth_invalidate_authorizer(struct ceph_auth_client *ac, + int peer_type); #endif -- cgit v1.2.3 From e9966076cdd952e19f2dd4854cd719be0d7cbebc Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 25 Mar 2013 10:26:30 -0700 Subject: libceph: wrap auth methods in a mutex The auth code is called from a variety of contexts, include the mon_client (protected by the monc's mutex) and the messenger callbacks (currently protected by nothing). Avoid chaos by protecting all auth state with a mutex. Nothing is blocking, so this should be simple and lightweight. Signed-off-by: Sage Weil Reviewed-by: Alex Elder --- include/linux/ceph/auth.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/auth.h b/include/linux/ceph/auth.h index c9c3b3abe4a3..5f3386844134 100644 --- a/include/linux/ceph/auth.h +++ b/include/linux/ceph/auth.h @@ -78,6 +78,8 @@ struct ceph_auth_client { u64 global_id; /* our unique id in system */ const struct ceph_crypto_key *key; /* our secret key */ unsigned want_keys; /* which services we want */ + + struct mutex mutex; }; extern struct ceph_auth_client *ceph_auth_init(const char *name, -- cgit v1.2.3 From 25aff7c559c8b54a810bc094d59fe037cfed6b18 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 11 Mar 2013 23:34:22 -0500 Subject: libceph: record residual bytes for all message data types All of the data types can use this, not just the page array. Until now, only the bio type doesn't have it available, and only the initiator of the request (the rbd client) is able to supply the length of the full request without re-scanning the bio list. Change the cursor init routines so the length is supplied based on the message header "data_len" field, and use that length to intiialize the "resid" field of the cursor. In addition, change the way "last_piece" is defined so it is based on the residual number of bytes in the original request. This is necessary (at least for bio messages) because it is possible for a read request to succeed without consuming all of the space available in the data buffer. This resolves: http://tracker.ceph.com/issues/4427 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/messenger.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 0e4536cc46f0..459e55280bf8 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -95,6 +95,7 @@ static __inline__ bool ceph_msg_data_type_valid(enum ceph_msg_data_type type) } struct ceph_msg_data_cursor { + size_t resid; /* bytes not yet consumed */ bool last_piece; /* now at last piece of data item */ union { #ifdef CONFIG_BLOCK @@ -105,7 +106,6 @@ struct ceph_msg_data_cursor { }; #endif /* CONFIG_BLOCK */ struct { /* pages */ - size_t resid; /* bytes from array */ unsigned int page_offset; /* offset in page */ unsigned short page_index; /* index in array */ unsigned short page_count; /* pages in array */ -- cgit v1.2.3 From 6518be47f910f62a98cb6044dbb457af55241f95 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 11 Mar 2013 23:34:23 -0500 Subject: libceph: kill ceph message bio_iter, bio_seg The bio_iter and bio_seg fields in a message are no longer used, we use the cursor instead. So get rid of them and the functions that operate on them them. This is related to: http://tracker.ceph.com/issues/4428 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/messenger.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 459e55280bf8..252e01b7f7de 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -121,11 +121,7 @@ struct ceph_msg_data { enum ceph_msg_data_type type; union { #ifdef CONFIG_BLOCK - struct { - struct bio *bio_iter; /* iterator */ - struct bio *bio; - unsigned int bio_seg; /* current seg in bio */ - }; + struct bio *bio; #endif /* CONFIG_BLOCK */ struct { struct page **pages; /* NOT OWNER. */ -- cgit v1.2.3 From 4c59b4a278f9b7a418ad8af933fd7b341df64393 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 11 Mar 2013 23:34:23 -0500 Subject: libceph: collapse all data items into one It turns out that only one of the data item types is ever used at any one time in a single message (currently). - A page array is used by the osd client (on behalf of the file system) and by rbd. Only one osd op (and therefore at most one data item) is ever used at a time by rbd. And the only time the file system sends two, the second op contains no data. - A bio is only used by the rbd client (and again, only one data item per message) - A page list is used by the file system and by rbd for outgoing data, but only one op (and one data item) at a time. We can therefore collapse all three of our data item fields into a single field "data", and depend on the messenger code to properly handle it based on its type. This allows us to eliminate quite a bit of duplicated code. This is related to: http://tracker.ceph.com/issues/4429 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/messenger.h | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 252e01b7f7de..af786b29f7a4 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -64,11 +64,7 @@ struct ceph_messenger { u32 required_features; }; -#define ceph_msg_has_pages(m) ((m)->p.type == CEPH_MSG_DATA_PAGES) -#define ceph_msg_has_pagelist(m) ((m)->l.type == CEPH_MSG_DATA_PAGELIST) -#ifdef CONFIG_BLOCK -#define ceph_msg_has_bio(m) ((m)->b.type == CEPH_MSG_DATA_BIO) -#endif /* CONFIG_BLOCK */ +#define ceph_msg_has_data(m) ((m)->data.type != CEPH_MSG_DATA_NONE) enum ceph_msg_data_type { CEPH_MSG_DATA_NONE, /* message contains no data payload */ @@ -145,11 +141,7 @@ struct ceph_msg { struct ceph_buffer *middle; /* data payload */ - struct ceph_msg_data p; /* pages */ - struct ceph_msg_data l; /* pagelist */ -#ifdef CONFIG_BLOCK - struct ceph_msg_data b; /* bio */ -#endif /* CONFIG_BLOCK */ + struct ceph_msg_data data; struct ceph_connection *con; struct list_head list_head; /* links for connection lists */ -- cgit v1.2.3 From 859a35d5523e8e6a5c3568c12febe2e1270bc3a1 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 11 Mar 2013 23:34:23 -0500 Subject: libceph: kill most of ceph_msg_pos All but one of the fields in the ceph_msg_pos structure are now never used (only assigned), so get rid of them. This allows several small blocks of code to go away. This is cleanup of old code related to: http://tracker.ceph.com/issues/4428 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/messenger.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index af786b29f7a4..c76b228cb524 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -157,8 +157,6 @@ struct ceph_msg { }; struct ceph_msg_pos { - int page, page_pos; /* which page; offset in page */ - int data_pos; /* offset in data payload */ bool did_page_crc; /* true if we've calculated crc for current page */ }; -- cgit v1.2.3 From f5db90bcf2c69d099f9d828a8104796f41de6bc5 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 11 Mar 2013 23:34:23 -0500 Subject: libceph: kill last of ceph_msg_pos The only remaining field in the ceph_msg_pos structure is did_page_crc. In the new cursor model of things that flag (or something like it) belongs in the cursor. Define a new field "need_crc" in the cursor (which applies to all types of data) and initialize it to true whenever a cursor is initialized. In write_partial_message_data(), the data CRC still will be computed as before, but it will check the cursor->need_crc field to determine whether it's needed. Any time the cursor is advanced to a new piece of a data item, need_crc will be set, and this will cause the crc for that entire piece to be accumulated into the data crc. In write_partial_message_data() the intermediate crc value is now held in a local variable so it doesn't have to be byte-swapped so many times. In read_partial_msg_data() we do something similar (but mainly for consistency there). With that, the ceph_msg_pos structure can go away, and it no longer needs to be passed as an argument to prepare_message_data(). This cleanup is related to: http://tracker.ceph.com/issues/4428 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/messenger.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index c76b228cb524..686df5bfa717 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -93,6 +93,7 @@ static __inline__ bool ceph_msg_data_type_valid(enum ceph_msg_data_type type) struct ceph_msg_data_cursor { size_t resid; /* bytes not yet consumed */ bool last_piece; /* now at last piece of data item */ + bool need_crc; /* new piece; crc update needed */ union { #ifdef CONFIG_BLOCK struct { /* bio */ @@ -156,10 +157,6 @@ struct ceph_msg { struct ceph_msgpool *pool; }; -struct ceph_msg_pos { - bool did_page_crc; /* true if we've calculated crc for current page */ -}; - /* ceph connection fault delay defaults, for exponential backoff */ #define BASE_DELAY_INTERVAL (HZ/2) #define MAX_DELAY_INTERVAL (5 * 60 * HZ) @@ -217,7 +214,6 @@ struct ceph_connection { struct ceph_msg *out_msg; /* sending message (== tail of out_sent) */ bool out_msg_done; - struct ceph_msg_pos out_msg_pos; struct kvec out_kvec[8], /* sending header/footer data */ *out_kvec_cur; @@ -231,7 +227,6 @@ struct ceph_connection { /* message in temps */ struct ceph_msg_header in_hdr; struct ceph_msg *in_msg; - struct ceph_msg_pos in_msg_pos; u32 in_front_crc, in_middle_crc, in_data_crc; /* calculated crc */ char in_tag; /* protocol control byte */ -- cgit v1.2.3 From 6644ed7b7e04f8e588aebdaa58cededb9416ab95 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 11 Mar 2013 23:34:24 -0500 Subject: libceph: make message data be a pointer Begin the transition from a single message data item to a list of them by replacing the "data" structure in a message with a pointer to a ceph_msg_data structure. A null pointer will indicate the message has no data; replace the use of ceph_msg_has_data() with a simple check for a null pointer. Create functions ceph_msg_data_create() and ceph_msg_data_destroy() to dynamically allocate and free a data item structure of a given type. When a message has its data item "set," allocate one of these to hold the data description, and free it when the last reference to the message is dropped. This partially resolves: http://tracker.ceph.com/issues/4429 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/messenger.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 686df5bfa717..3181321bed6d 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -64,8 +64,6 @@ struct ceph_messenger { u32 required_features; }; -#define ceph_msg_has_data(m) ((m)->data.type != CEPH_MSG_DATA_NONE) - enum ceph_msg_data_type { CEPH_MSG_DATA_NONE, /* message contains no data payload */ CEPH_MSG_DATA_PAGES, /* data source/destination is a page array */ @@ -141,8 +139,7 @@ struct ceph_msg { struct kvec front; /* unaligned blobs of message */ struct ceph_buffer *middle; - /* data payload */ - struct ceph_msg_data data; + struct ceph_msg_data *data; /* data payload */ struct ceph_connection *con; struct list_head list_head; /* links for connection lists */ -- cgit v1.2.3 From adfe695a25e92e3a4597807fbc7f9a8105218776 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 13 Mar 2013 20:50:00 -0500 Subject: ceph: move max constant definitions Move some definitions for max integer values out of the rbd code and into the more central "decode.h" header file. These really belong in a Linux (or libc) header somewhere, but I haven't gotten around to proposing that yet. This is in preparation for moving some code out of rbd.c and into the osd client. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/decode.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/decode.h b/include/linux/ceph/decode.h index 360d9d08ca9e..689f1df37bff 100644 --- a/include/linux/ceph/decode.h +++ b/include/linux/ceph/decode.h @@ -8,6 +8,13 @@ #include +/* This seemed to be the easiest place to define these */ + +#define U8_MAX ((u8) (~0U)) +#define U16_MAX ((u16) (~0U)) +#define U32_MAX ((u32) (~0U)) +#define U64_MAX ((u64) (~0ULL)) + /* * in all cases, * void **p pointer to position pointer -- cgit v1.2.3 From 33803f3300265661b5c5d20a9811c6a2a157d545 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 13 Mar 2013 20:50:00 -0500 Subject: libceph: define source request op functions The rbd code has a function that allocates and populates a ceph_osd_req_op structure (the in-core version of an osd request operation). When reviewed, Josh suggested two things: that the big varargs function might be better split into type-specific functions; and that this functionality really belongs in the osd client rather than rbd. This patch implements both of Josh's suggestions. It breaks up the rbd function into separate functions and defines them in the osd client module as exported interfaces. Unlike the rbd version, however, the functions don't allocate an osd_req_op structure; they are provided the address of one and that is initialized instead. The rbd function has been eliminated and calls to it have been replaced by calls to the new routines. The rbd code now now use a stack (struct) variable to hold the op rather than allocating and freeing it each time. For now only the capabilities used by rbd are implemented. Implementing all the other osd op types, and making the rest of the code use it will be done separately, in the next few patches. Note that only the extent, cls, and watch portions of the ceph_osd_req_op structure are currently used. Delete the others (xattr, pgls, and snap) from its definition so nobody thinks it's actually implemented or needed. We can add it back again later if needed, when we know it's been tested. This (and a few follow-on patches) resolves: http://tracker.ceph.com/issues/3861 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/osd_client.h | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 1dab291b2dc6..5fd2cbfcfd91 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -201,14 +201,6 @@ struct ceph_osd_req_op { u64 truncate_size; u32 truncate_seq; } extent; - struct { - const char *name; - const void *val; - u32 name_len; - u32 value_len; - __u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */ - __u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */ - } xattr; struct { const char *class_name; const char *method_name; @@ -218,13 +210,6 @@ struct ceph_osd_req_op { __u8 method_len; __u8 argc; } cls; - struct { - u64 cookie; - u64 count; - } pgls; - struct { - u64 snapid; - } snap; struct { u64 cookie; u64 ver; @@ -244,6 +229,17 @@ extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc, extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg); +extern void osd_req_op_init(struct ceph_osd_req_op *op, u16 opcode); +extern void osd_req_op_extent_init(struct ceph_osd_req_op *op, u16 opcode, + u64 offset, u64 length, + u64 truncate_size, u32 truncate_seq); +extern void osd_req_op_cls_init(struct ceph_osd_req_op *op, u16 opcode, + const char *class, const char *method, + const void *request_data, + size_t request_data_size); +extern void osd_req_op_watch_init(struct ceph_osd_req_op *op, u16 opcode, + u64 cookie, u64 version, int flag); + extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, struct ceph_snap_context *snapc, unsigned int num_op, -- cgit v1.2.3 From ef4859d6479d19bcc65c3156cf3b7dd747355c29 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 1 Apr 2013 18:58:26 -0500 Subject: libceph: define ceph_decode_pgid() only once There are two basically identical definitions of __decode_pgid() in libceph, one in "net/ceph/osdmap.c" and the other in "net/ceph/osd_client.c". Get rid of both, and instead define a single inline version in "include/linux/ceph/osdmap.h". Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/osdmap.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 167daf60c4e8..d05cc4451af6 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -3,6 +3,7 @@ #include #include +#include #include #include @@ -119,6 +120,29 @@ static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map, return &map->osd_addr[osd]; } +static inline int ceph_decode_pgid(void **p, void *end, struct ceph_pg *pgid) +{ + __u8 version; + + if (!ceph_has_room(p, end, 1 + 8 + 4 + 4)) { + pr_warning("incomplete pg encoding"); + + return -EINVAL; + } + version = ceph_decode_8(p); + if (version > 1) { + pr_warning("do not understand pg encoding %d > 1", + (int)version); + return -EINVAL; + } + + pgid->pool = ceph_decode_64(p); + pgid->seed = ceph_decode_32(p); + *p += 4; /* skip deprecated preferred value */ + + return 0; +} + extern struct ceph_osdmap *osdmap_decode(void **p, void *end); extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, struct ceph_osdmap *map, -- cgit v1.2.3 From ace6d3a96f00c271b3f337adcde8e8cbe39c3820 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 1 Apr 2013 16:12:14 -0500 Subject: libceph: drop ceph_osd_request->r_con_filling_msg A field in an osd request keeps track of whether a connection is currently filling the request's reply message. This patch gets rid of that field. An osd request includes two messages--a request and a reply--and they're both associated with the connection that existed to its the target osd at the time the request was created. An osd request can be dropped early, even when it's in flight. And at that time both messages are released. It's possible the reply message has been supplied to its connection to receive an incoming response message at the time the osd request gets dropped. So ceph_osdc_release_request() revokes that message from the connection before releasing it so things get cleaned up properly. Previously this may have caused a problem, because the connection that a message was associated with might have gone away before the revoke request. And to avoid any problems using that connection, the osd client held a reference to it when it supplies its response message. However since this commit: 38941f80 libceph: have messages point to their connection all messages hold a reference to the connection they are associated with whenever the connection is actively operating on the message (i.e. while the message is queued to send or sending, and when it data is being received into it). And if a message has no connection associated with it, ceph_msg_revoke_incoming() won't do anything when asked to revoke it. As a result, there is no need to keep an additional reference to the connection associated with a message when we hand the message to the messenger when it calls our alloc_msg() method to receive something. If the connection *were* operating on it, it would have its own reference, and if not, there's no work to be done when we need to revoke it. So get rid of the osd request's r_con_filling_msg field. This resolves: http://tracker.ceph.com/issues/4647 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/osd_client.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 5fd2cbfcfd91..3b5ba31c2cbd 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -89,8 +89,6 @@ struct ceph_osd_request { int r_pg_osds[CEPH_PG_MAX_SIZE]; int r_num_pg_osds; - struct ceph_connection *r_con_filling_msg; - struct ceph_msg *r_request, *r_reply; int r_flags; /* any additional flags for the osd */ u32 r_sent; /* >0 if r_request is sending/sent */ -- cgit v1.2.3 From fdce58ccb5df621695b079378c619046acabc778 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 14 Mar 2013 14:09:06 -0500 Subject: libceph: record length of bio list with bio When assigning a bio pointer to an osd request, we don't have an efficient way of knowing the total length bytes in the bio list. That information is available at the point it's set up by the rbd code, so record it with the osd data when it's set. This and the next patch are related to maintaining the length of a message's data independent of the message header, as described here: http://tracker.ceph.com/issues/4589 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/osd_client.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 3b5ba31c2cbd..fdda93ebbb4c 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -71,7 +71,10 @@ struct ceph_osd_data { }; struct ceph_pagelist *pagelist; #ifdef CONFIG_BLOCK - struct bio *bio; + struct { + struct bio *bio; /* list of bios */ + size_t bio_length; /* total in list */ + }; #endif /* CONFIG_BLOCK */ }; }; -- cgit v1.2.3 From a19308048182d5f9e16b03b1d1c038d9346c7589 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 14 Mar 2013 14:09:06 -0500 Subject: libceph: record message data length Keep track of the length of the data portion for a message in a separate field in the ceph_msg structure. This information has been maintained in wire byte order in the message header, but that's going to change soon. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/messenger.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 3181321bed6d..b832c0ce899a 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -139,6 +139,7 @@ struct ceph_msg { struct kvec front; /* unaligned blobs of message */ struct ceph_buffer *middle; + size_t data_length; struct ceph_msg_data *data; /* data payload */ struct ceph_connection *con; @@ -270,7 +271,8 @@ extern void ceph_msg_data_set_pages(struct ceph_msg *msg, struct page **pages, size_t length, size_t alignment); extern void ceph_msg_data_set_pagelist(struct ceph_msg *msg, struct ceph_pagelist *pagelist); -extern void ceph_msg_data_set_bio(struct ceph_msg *msg, struct bio *bio); +extern void ceph_msg_data_set_bio(struct ceph_msg *msg, struct bio *bio, + size_t length); extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, bool can_fail); -- cgit v1.2.3 From acead002b200569273bed331c93c4a91d25e10b8 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 14 Mar 2013 14:09:05 -0500 Subject: libceph: don't build request in ceph_osdc_new_request() This patch moves the call to ceph_osdc_build_request() out of ceph_osdc_new_request() and into its caller. This is in order to defer formatting osd operation information into the request message until just before request is started. The only unusual (ab)user of ceph_osdc_build_request() is ceph_writepages_start(), where the final length of write request may change (downward) based on the current inode size or the oldest snapshot context with dirty data for the inode. The remaining callers don't change anything in the request after has been built. This means the ops array is now supplied by the caller. It also means there is no need to pass the mtime to ceph_osdc_new_request() (it gets provided to ceph_osdc_build_request()). And rather than passing a do_sync flag, have the number of ops in the ops array supplied imply adding a second STARTSYNC operation after the READ or WRITE requested. This and some of the patches that follow are related to having the messenger (only) be responsible for filling the content of the message header, as described here: http://tracker.ceph.com/issues/4589 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/osd_client.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index fdda93ebbb4c..ffaf9076fdc4 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -243,12 +243,12 @@ extern void osd_req_op_watch_init(struct ceph_osd_req_op *op, u16 opcode, extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, struct ceph_snap_context *snapc, - unsigned int num_op, + unsigned int num_ops, bool use_mempool, gfp_t gfp_flags); extern void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off, - unsigned int num_op, + unsigned int num_ops, struct ceph_osd_req_op *src_ops, struct ceph_snap_context *snapc, u64 snap_id, @@ -257,11 +257,11 @@ extern void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off, extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *, struct ceph_file_layout *layout, struct ceph_vino vino, - u64 offset, u64 *len, int op, int flags, + u64 offset, u64 *len, + int num_ops, struct ceph_osd_req_op *ops, + int opcode, int flags, struct ceph_snap_context *snapc, - int do_sync, u32 truncate_seq, - u64 truncate_size, - struct timespec *mtime, + u32 truncate_seq, u64 truncate_size, bool use_mempool); extern void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc, -- cgit v1.2.3 From e5975c7c8eb6aeab8d2f76a98c368081082795e0 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 14 Mar 2013 14:09:05 -0500 Subject: ceph: build osd request message later for writepages Hold off building the osd request message in ceph_writepages_start() until just before it will be submitted to the osd client for execution. We'll still create the request and allocate the page pointer array after we learn we have at least one page to write. A local variable will be used to keep track of the allocated array of pages. Wait until just before submitting the request for assigning that page array pointer to the request message. Create ands use a new function osd_req_op_extent_update() whose purpose is to serve this one spot where the length value supplied when an osd request's op was initially formatted might need to get changed (reduced, never increased) before submitting the request. Previously, ceph_writepages_start() assigned the message header's data length because of this update. That's no longer necessary, because ceph_osdc_build_request() will recalculate the right value to use based on the content of the ops in the request. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/osd_client.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index ffaf9076fdc4..5ee1a3776b4b 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -234,6 +234,7 @@ extern void osd_req_op_init(struct ceph_osd_req_op *op, u16 opcode); extern void osd_req_op_extent_init(struct ceph_osd_req_op *op, u16 opcode, u64 offset, u64 length, u64 truncate_size, u32 truncate_seq); +extern void osd_req_op_extent_update(struct ceph_osd_req_op *op, u64 length); extern void osd_req_op_cls_init(struct ceph_osd_req_op *op, u16 opcode, const char *class, const char *method, const void *request_data, -- cgit v1.2.3 From 43bfe5de9fa78e07248b70992ce50321efec622c Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 3 Apr 2013 01:28:57 -0500 Subject: libceph: define osd data initialization helpers Define and use functions that encapsulate the initializion of a ceph_osd_data structure. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/osd_client.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 5ee1a3776b4b..af60dac1f9c0 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -280,6 +280,17 @@ static inline void ceph_osdc_put_request(struct ceph_osd_request *req) kref_put(&req->r_kref, ceph_osdc_release_request); } +extern void ceph_osd_data_pages_init(struct ceph_osd_data *osd_data, + struct page **pages, u64 length, + u32 alignment, bool pages_from_pool, + bool own_pages); +extern void ceph_osd_data_pagelist_init(struct ceph_osd_data *osd_data, + struct ceph_pagelist *pagelist); +#ifdef CONFIG_BLOCK +extern void ceph_osd_data_bio_init(struct ceph_osd_data *osd_data, + struct bio *bio, size_t bio_length); +#endif /* CONFIG_BLOCK */ + extern int ceph_osdc_start_request(struct ceph_osd_client *osdc, struct ceph_osd_request *req, bool nofail); -- cgit v1.2.3 From 79528734f3ae4699a2886f62f55e18fb34fb3651 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 3 Apr 2013 21:32:51 -0500 Subject: libceph: keep source rather than message osd op array An osd request keeps a pointer to the osd operations (ops) array that it builds in its request message. In order to allow each op in the array to have its own distinct data, we will need to keep track of each op's data, and that information does not go over the wire. As long as we're tracking the data we might as well just track the entire (source) op definition for each of the ops. And if we're doing that, we'll have no more need to keep a pointer to the wire-encoded version. This patch makes the array of source ops be kept with the osd request structure, and uses that instead of the version encoded in the message in places where that was previously used. The array will be embedded in the request structure, and the maximum number of ops we ever actually use is currently 2. So reduce CEPH_OSD_MAX_OP to 2 to reduce the size of the structure. The result of doing this sort of ripples back up, and as a result various function parameters and local variables become unnecessary. Make r_num_ops be unsigned, and move the definition of struct ceph_osd_req_op earlier to ensure it's defined where needed. It does not yet add per-op data, that's coming soon. This resolves: http://tracker.ceph.com/issues/4656 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/osd_client.h | 70 ++++++++++++++++++++--------------------- 1 file changed, 34 insertions(+), 36 deletions(-) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index af60dac1f9c0..f4c1a2a22a14 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -48,7 +48,7 @@ struct ceph_osd { }; -#define CEPH_OSD_MAX_OP 10 +#define CEPH_OSD_MAX_OP 2 enum ceph_osd_data_type { CEPH_OSD_DATA_TYPE_NONE, @@ -79,6 +79,34 @@ struct ceph_osd_data { }; }; +struct ceph_osd_req_op { + u16 op; /* CEPH_OSD_OP_* */ + u32 payload_len; + union { + struct { + u64 offset, length; + u64 truncate_size; + u32 truncate_seq; + } extent; + struct { + const char *class_name; + const char *method_name; + const void *indata; + u32 indata_len; + __u8 class_len; + __u8 method_len; + __u8 argc; + } cls; + struct { + u64 cookie; + u64 ver; + u32 prot_ver; + u32 timeout; + __u8 flag; + } watch; + }; +}; + /* an in-flight request */ struct ceph_osd_request { u64 r_tid; /* unique for this client */ @@ -95,10 +123,11 @@ struct ceph_osd_request { struct ceph_msg *r_request, *r_reply; int r_flags; /* any additional flags for the osd */ u32 r_sent; /* >0 if r_request is sending/sent */ - int r_num_ops; - /* encoded message content */ - struct ceph_osd_op *r_request_ops; + /* request osd ops array */ + unsigned int r_num_ops; + struct ceph_osd_req_op r_ops[CEPH_OSD_MAX_OP]; + /* these are updated on each send */ __le32 *r_request_osdmap_epoch; __le32 *r_request_flags; @@ -193,34 +222,6 @@ struct ceph_osd_client { struct workqueue_struct *notify_wq; }; -struct ceph_osd_req_op { - u16 op; /* CEPH_OSD_OP_* */ - u32 payload_len; - union { - struct { - u64 offset, length; - u64 truncate_size; - u32 truncate_seq; - } extent; - struct { - const char *class_name; - const char *method_name; - const void *indata; - u32 indata_len; - __u8 class_len; - __u8 method_len; - __u8 argc; - } cls; - struct { - u64 cookie; - u64 ver; - u32 prot_ver; - u32 timeout; - __u8 flag; - } watch; - }; -}; - extern int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client); extern void ceph_osdc_stop(struct ceph_osd_client *osdc); @@ -249,8 +250,6 @@ extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client * gfp_t gfp_flags); extern void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off, - unsigned int num_ops, - struct ceph_osd_req_op *src_ops, struct ceph_snap_context *snapc, u64 snap_id, struct timespec *mtime); @@ -259,8 +258,7 @@ extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *, struct ceph_file_layout *layout, struct ceph_vino vino, u64 offset, u64 *len, - int num_ops, struct ceph_osd_req_op *ops, - int opcode, int flags, + int num_ops, int opcode, int flags, struct ceph_snap_context *snapc, u32 truncate_seq, u64 truncate_size, bool use_mempool); -- cgit v1.2.3 From 54d5064912649e296552f298e6472ffd37cd8f90 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 3 Apr 2013 01:28:58 -0500 Subject: libceph: rename data out field in osd request op There are fields "indata" and "indata_len" defined the ceph osd request op structure. The "in" part is with from the point of view of the osd server, but is a little confusing here on the client side. Change their names to use "request" instead of "in" to indicate that it defines data provided with the request (as opposed the data returned in the response). Rename the local variable in osd_req_encode_op() to match. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/osd_client.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index f4c1a2a22a14..a9c4089894c8 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -91,8 +91,8 @@ struct ceph_osd_req_op { struct { const char *class_name; const char *method_name; - const void *indata; - u32 indata_len; + const void *request_data; + u32 request_data_len; __u8 class_len; __u8 method_len; __u8 argc; -- cgit v1.2.3 From 8c042b0df99cd06ef8473ef6e204b87b3dc80158 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 3 Apr 2013 01:28:58 -0500 Subject: libceph: add data pointers in osd op structures An extent type osd operation currently implies that there will be corresponding data supplied in the data portion of the request (for write) or response (for read) message. Similarly, an osd class method operation implies a data item will be supplied to receive the response data from the operation. Add a ceph_osd_data pointer to each of those structures, and assign it to point to eithre the incoming or the outgoing data structure in the osd message. The data is not always available when an op is initially set up, so add two new functions to allow setting them after the op has been initialized. Begin to make use of the data item pointer available in the osd operation rather than the request data in or out structure in places where it's convenient. Add some assertions to verify pointers are always set the way they're expected to be. This is a sort of stepping stone toward really moving the data into the osd request ops, to allow for some validation before making that jump. This is the first in a series of patches that resolve: http://tracker.ceph.com/issues/4657 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/osd_client.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index a9c4089894c8..ae5193550fbf 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -87,12 +87,14 @@ struct ceph_osd_req_op { u64 offset, length; u64 truncate_size; u32 truncate_seq; + struct ceph_osd_data *osd_data; } extent; struct { const char *class_name; const char *method_name; const void *request_data; u32 request_data_len; + struct ceph_osd_data *response_data; __u8 class_len; __u8 method_len; __u8 argc; @@ -236,10 +238,14 @@ extern void osd_req_op_extent_init(struct ceph_osd_req_op *op, u16 opcode, u64 offset, u64 length, u64 truncate_size, u32 truncate_seq); extern void osd_req_op_extent_update(struct ceph_osd_req_op *op, u64 length); +extern void osd_req_op_extent_osd_data(struct ceph_osd_req_op *op, + struct ceph_osd_data *osd_data); extern void osd_req_op_cls_init(struct ceph_osd_req_op *op, u16 opcode, const char *class, const char *method, const void *request_data, size_t request_data_size); +extern void osd_req_op_cls_response_data(struct ceph_osd_req_op *op, + struct ceph_osd_data *response_data); extern void osd_req_op_watch_init(struct ceph_osd_req_op *op, u16 opcode, u64 cookie, u64 version, int flag); -- cgit v1.2.3 From c99d2d4abb6c405ef52e9bc1da87b382b8f41739 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 5 Apr 2013 01:27:11 -0500 Subject: libceph: specify osd op by index in request An osd request now holds all of its source op structures, and every place that initializes one of these is in fact initializing one of the entries in the the osd request's array. So rather than supplying the address of the op to initialize, have caller specify the osd request and an indication of which op it would like to initialize. This better hides the details the op structure (and faciltates moving the data pointers they use). Since osd_req_op_init() is a common routine, and it's not used outside the osd client code, give it static scope. Also make it return the address of the specified op (so all the other init routines don't have to repeat that code). Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/osd_client.h | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index ae5193550fbf..144d57cbef9e 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -233,20 +233,25 @@ extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc, extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg); -extern void osd_req_op_init(struct ceph_osd_req_op *op, u16 opcode); -extern void osd_req_op_extent_init(struct ceph_osd_req_op *op, u16 opcode, +extern void osd_req_op_extent_init(struct ceph_osd_request *osd_req, + unsigned int which, u16 opcode, u64 offset, u64 length, u64 truncate_size, u32 truncate_seq); -extern void osd_req_op_extent_update(struct ceph_osd_req_op *op, u64 length); -extern void osd_req_op_extent_osd_data(struct ceph_osd_req_op *op, +extern void osd_req_op_extent_update(struct ceph_osd_request *osd_req, + unsigned int which, u64 length); +extern void osd_req_op_extent_osd_data(struct ceph_osd_request *osd_req, + unsigned int which, struct ceph_osd_data *osd_data); -extern void osd_req_op_cls_init(struct ceph_osd_req_op *op, u16 opcode, +extern void osd_req_op_cls_init(struct ceph_osd_request *osd_req, + unsigned int which, u16 opcode, const char *class, const char *method, const void *request_data, size_t request_data_size); -extern void osd_req_op_cls_response_data(struct ceph_osd_req_op *op, +extern void osd_req_op_cls_response_data(struct ceph_osd_request *osd_req, + unsigned int which, struct ceph_osd_data *response_data); -extern void osd_req_op_watch_init(struct ceph_osd_req_op *op, u16 opcode, +extern void osd_req_op_watch_init(struct ceph_osd_request *osd_req, + unsigned int which, u16 opcode, u64 cookie, u64 version, int flag); extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, -- cgit v1.2.3 From 5f562df5f59340eae4272501b974903f48d2ad92 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 5 Apr 2013 01:27:12 -0500 Subject: libceph: format class info at init time An object class method is formatted using a pagelist which contains the class name, the method name, and the data concatenated into an osd request's outbound data. Currently when a class op is initialized in osd_req_op_cls_init(), the lengths of and pointers to these three items are recorded. Later, when the op is getting formatted into the request message, a new pagelist is created and that is when these items get copied into the pagelist. This patch makes it so the pagelist to hold these items is created when the op is initialized instead. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/osd_client.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 144d57cbef9e..71c41575646d 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -93,8 +93,9 @@ struct ceph_osd_req_op { const char *class_name; const char *method_name; const void *request_data; - u32 request_data_len; + struct ceph_osd_data *request_info; struct ceph_osd_data *response_data; + u32 request_data_len; __u8 class_len; __u8 method_len; __u8 argc; -- cgit v1.2.3 From a4ce40a9a7c1053ac2a41cf64255e44e356e5522 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 5 Apr 2013 01:27:12 -0500 Subject: libceph: combine initializing and setting osd data This ends up being a rather large patch but what it's doing is somewhat straightforward. Basically, this is replacing two calls with one. The first of the two calls is initializing a struct ceph_osd_data with data (either a page array, a page list, or a bio list); the second is setting an osd request op so it associates that data with one of the op's parameters. In place of those two will be a single function that initializes the op directly. That means we sort of fan out a set of the needed functions: - extent ops with pages data - extent ops with pagelist data - extent ops with bio list data and - class ops with page data for receiving a response We also have define another one, but it's only used internally: - class ops with pagelist data for request parameters Note that we *still* haven't gotten rid of the osd request's r_data_in and r_data_out fields. All the osd ops refer to them for their data. For now, these data fields are pointers assigned to the appropriate r_data_* field when these new functions are called. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/osd_client.h | 43 ++++++++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 16 deletions(-) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 71c41575646d..f8a00b48e550 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -240,17 +240,39 @@ extern void osd_req_op_extent_init(struct ceph_osd_request *osd_req, u64 truncate_size, u32 truncate_seq); extern void osd_req_op_extent_update(struct ceph_osd_request *osd_req, unsigned int which, u64 length); -extern void osd_req_op_extent_osd_data(struct ceph_osd_request *osd_req, + +extern struct ceph_osd_data *osd_req_op_extent_osd_data( + struct ceph_osd_request *osd_req, + unsigned int which, bool write_request); +extern struct ceph_osd_data *osd_req_op_cls_response_data( + struct ceph_osd_request *osd_req, + unsigned int which); + +extern void osd_req_op_extent_osd_data_pages(struct ceph_osd_request *, + unsigned int which, bool write_request, + struct page **pages, u64 length, + u32 alignment, bool pages_from_pool, + bool own_pages); +extern void osd_req_op_extent_osd_data_pagelist(struct ceph_osd_request *, + unsigned int which, bool write_request, + struct ceph_pagelist *pagelist); +#ifdef CONFIG_BLOCK +extern void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *, + unsigned int which, bool write_request, + struct bio *bio, size_t bio_length); +#endif /* CONFIG_BLOCK */ + +extern void osd_req_op_cls_response_data_pages(struct ceph_osd_request *, unsigned int which, - struct ceph_osd_data *osd_data); + struct page **pages, u64 length, + u32 alignment, bool pages_from_pool, + bool own_pages); + extern void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, u16 opcode, const char *class, const char *method, const void *request_data, size_t request_data_size); -extern void osd_req_op_cls_response_data(struct ceph_osd_request *osd_req, - unsigned int which, - struct ceph_osd_data *response_data); extern void osd_req_op_watch_init(struct ceph_osd_request *osd_req, unsigned int which, u16 opcode, u64 cookie, u64 version, int flag); @@ -290,17 +312,6 @@ static inline void ceph_osdc_put_request(struct ceph_osd_request *req) kref_put(&req->r_kref, ceph_osdc_release_request); } -extern void ceph_osd_data_pages_init(struct ceph_osd_data *osd_data, - struct page **pages, u64 length, - u32 alignment, bool pages_from_pool, - bool own_pages); -extern void ceph_osd_data_pagelist_init(struct ceph_osd_data *osd_data, - struct ceph_pagelist *pagelist); -#ifdef CONFIG_BLOCK -extern void ceph_osd_data_bio_init(struct ceph_osd_data *osd_data, - struct bio *bio, size_t bio_length); -#endif /* CONFIG_BLOCK */ - extern int ceph_osdc_start_request(struct ceph_osd_client *osdc, struct ceph_osd_request *req, bool nofail); -- cgit v1.2.3 From ec9123c56787fa7fb2608f05b19d21c5e1912d87 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 5 Apr 2013 01:27:12 -0500 Subject: libceph: set the data pointers when encoding ops Still using the osd request r_data_in and r_data_out pointer, but we're basically only referring to it via the data pointers in the osd ops. And we're transferring that information to the request or reply message only when the op indicates it's needed, in osd_req_encode_op(). To avoid a forward reference, ceph_osdc_msg_data_set() was moved up in the file. Don't bother calling ceph_osd_data_init(), in ceph_osd_alloc(), because the ops array will already be zeroed anyway. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/osd_client.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index f8a00b48e550..dd4ca4ba8cab 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -51,7 +51,7 @@ struct ceph_osd { #define CEPH_OSD_MAX_OP 2 enum ceph_osd_data_type { - CEPH_OSD_DATA_TYPE_NONE, + CEPH_OSD_DATA_TYPE_NONE = 0, CEPH_OSD_DATA_TYPE_PAGES, CEPH_OSD_DATA_TYPE_PAGELIST, #ifdef CONFIG_BLOCK -- cgit v1.2.3 From 5476492fba9fd0b4118aacf5b924dd29b8cca56c Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 5 Apr 2013 01:27:12 -0500 Subject: libceph: kill off osd request r_data_in and r_data_out Finally! Convert the osd op data pointers into real structures, and make the switch over to using them instead of having all ops share the in and/or out data structures in the osd request. Set up a new function to traverse the set of ops and release any data associated with them (pages). This and the patches leading up to it resolve: http://tracker.ceph.com/issues/4657 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/osd_client.h | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index dd4ca4ba8cab..4ec46c0ceaf7 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -87,14 +87,14 @@ struct ceph_osd_req_op { u64 offset, length; u64 truncate_size; u32 truncate_seq; - struct ceph_osd_data *osd_data; + struct ceph_osd_data osd_data; } extent; struct { const char *class_name; const char *method_name; const void *request_data; - struct ceph_osd_data *request_info; - struct ceph_osd_data *response_data; + struct ceph_osd_data request_info; + struct ceph_osd_data response_data; u32 request_data_len; __u8 class_len; __u8 method_len; @@ -164,9 +164,6 @@ struct ceph_osd_request { struct ceph_file_layout r_file_layout; struct ceph_snap_context *r_snapc; /* snap context for writes */ - - struct ceph_osd_data r_data_in; - struct ceph_osd_data r_data_out; }; struct ceph_osd_event { -- cgit v1.2.3 From ea96571f7b865edaf1acd472e6f2cddc9fb67892 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 5 Apr 2013 14:46:01 -0500 Subject: libceph: fix possible CONFIG_BLOCK build problem This patch: 15a0d7b libceph: record message data length did not enclose some bio-specific code inside CONFIG_BLOCK as it should have. Fix that. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/messenger.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index b832c0ce899a..cdeebae03e0d 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -271,8 +271,10 @@ extern void ceph_msg_data_set_pages(struct ceph_msg *msg, struct page **pages, size_t length, size_t alignment); extern void ceph_msg_data_set_pagelist(struct ceph_msg *msg, struct ceph_pagelist *pagelist); +#ifdef CONFIG_BLOCK extern void ceph_msg_data_set_bio(struct ceph_msg *msg, struct bio *bio, size_t length); +#endif /* CONFIG_BLOCK */ extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, bool can_fail); -- cgit v1.2.3 From c851c49591ebf000c610711e39eea7da5ff05b21 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 5 Apr 2013 14:46:01 -0500 Subject: libceph: record bio length The bio is the only data item type that doesn't record its full length. Fix that. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/messenger.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index cdeebae03e0d..4fb870a5b5fc 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -116,7 +116,10 @@ struct ceph_msg_data { enum ceph_msg_data_type type; union { #ifdef CONFIG_BLOCK - struct bio *bio; + struct { + struct bio *bio; + size_t bio_length; + }; #endif /* CONFIG_BLOCK */ struct { struct page **pages; /* NOT OWNER. */ -- cgit v1.2.3 From 36153ec9dd6287d7cedf6afb51453c445d946cee Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 14 Mar 2013 14:09:06 -0500 Subject: libceph: move cursor into message A message will only be processing a single data item at a time, so there's no need for each data item to have its own cursor. Move the cursor embedded in the message data structure into the message itself. To minimize the impact, keep the data->cursor field, but make it be a pointer to the cursor in the message. Move the definition of ceph_msg_data above ceph_msg_data_cursor so the cursor can point to the data without a forward definition rather than vice-versa. This and the upcoming patches are part of: http://tracker.ceph.com/issues/3761 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/messenger.h | 43 +++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 21 deletions(-) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 4fb870a5b5fc..e7557242817c 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -88,6 +88,25 @@ static __inline__ bool ceph_msg_data_type_valid(enum ceph_msg_data_type type) } } +struct ceph_msg_data { + enum ceph_msg_data_type type; + union { +#ifdef CONFIG_BLOCK + struct { + struct bio *bio; + size_t bio_length; + }; +#endif /* CONFIG_BLOCK */ + struct { + struct page **pages; /* NOT OWNER. */ + size_t length; /* total # bytes */ + unsigned int alignment; /* first page */ + }; + struct ceph_pagelist *pagelist; + }; + struct ceph_msg_data_cursor *cursor; +}; + struct ceph_msg_data_cursor { size_t resid; /* bytes not yet consumed */ bool last_piece; /* now at last piece of data item */ @@ -112,25 +131,6 @@ struct ceph_msg_data_cursor { }; }; -struct ceph_msg_data { - enum ceph_msg_data_type type; - union { -#ifdef CONFIG_BLOCK - struct { - struct bio *bio; - size_t bio_length; - }; -#endif /* CONFIG_BLOCK */ - struct { - struct page **pages; /* NOT OWNER. */ - size_t length; /* total # bytes */ - unsigned int alignment; /* first page */ - }; - struct ceph_pagelist *pagelist; - }; - struct ceph_msg_data_cursor cursor; /* pagelist only */ -}; - /* * a single message. it contains a header (src, dest, message type, etc.), * footer (crc values, mainly), a "front" message body, and possibly a @@ -142,8 +142,9 @@ struct ceph_msg { struct kvec front; /* unaligned blobs of message */ struct ceph_buffer *middle; - size_t data_length; - struct ceph_msg_data *data; /* data payload */ + size_t data_length; + struct ceph_msg_data *data; + struct ceph_msg_data_cursor cursor; struct ceph_connection *con; struct list_head list_head; /* links for connection lists */ -- cgit v1.2.3 From 8ae4f4f5c056150d5480710ab356801e84d01a3d Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 14 Mar 2013 14:09:06 -0500 Subject: libceph: have cursor point to data Rather than having a ceph message data item point to the cursor it's associated with, have the cursor point to a data item. This will allow a message cursor to be used for more than one data item. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/messenger.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index e7557242817c..8846ff610502 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -104,13 +104,13 @@ struct ceph_msg_data { }; struct ceph_pagelist *pagelist; }; - struct ceph_msg_data_cursor *cursor; }; struct ceph_msg_data_cursor { - size_t resid; /* bytes not yet consumed */ - bool last_piece; /* now at last piece of data item */ - bool need_crc; /* new piece; crc update needed */ + struct ceph_msg_data *data; /* data item this describes */ + size_t resid; /* bytes not yet consumed */ + bool last_piece; /* current is last piece */ + bool need_crc; /* crc update needed */ union { #ifdef CONFIG_BLOCK struct { /* bio */ -- cgit v1.2.3 From 5240d9f95dfe0f0701b35fbff1cb5b70825ad23f Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 14 Mar 2013 14:09:06 -0500 Subject: libceph: replace message data pointer with list In place of the message data pointer, use a list head which links through message data items. For now we only support a single entry on that list. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/messenger.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 8846ff610502..318da0170a1e 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -89,6 +89,7 @@ static __inline__ bool ceph_msg_data_type_valid(enum ceph_msg_data_type type) } struct ceph_msg_data { + struct list_head links; /* ceph_msg->data */ enum ceph_msg_data_type type; union { #ifdef CONFIG_BLOCK @@ -143,7 +144,7 @@ struct ceph_msg { struct ceph_buffer *middle; size_t data_length; - struct ceph_msg_data *data; + struct list_head data; struct ceph_msg_data_cursor cursor; struct ceph_connection *con; -- cgit v1.2.3 From ca8b3a69174b04376722672d7dd6b666a7f17c50 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 5 Apr 2013 14:46:01 -0500 Subject: libceph: implement multiple data items in a message This patch adds support to the messenger for more than one data item in its data list. A message data cursor has two more fields to support this: - a count of the number of bytes left to be consumed across all data items in the list, "total_resid" - a pointer to the head of the list (for validation only) The cursor initialization routine has been split into two parts: the outer one, which initializes the cursor for traversing the entire list of data items; and the inner one, which initializes the cursor to start processing a single data item. When a message cursor is first initialized, the outer initialization routine sets total_resid to the length provided. The data pointer is initialized to the first data item on the list. From there, the inner initialization routine finishes by setting up to process the data item the cursor points to. Advancing the cursor consumes bytes in total_resid. If the resid field reaches zero, it means the current data item is fully consumed. If total_resid indicates there is more data, the cursor is advanced to point to the next data item, and then the inner initialization routine prepares for using that. (A check is made at this point to make sure we don't wrap around the front of the list.) The type-specific init routines are modified so they can be given a length that's larger than what the data item can support. The resid field is initialized to the smaller of the provided length and the length of the entire data item. When total_resid reaches zero, we're done. This resolves: http://tracker.ceph.com/issues/3761 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/messenger.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 318da0170a1e..de1d2e1ecce2 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -108,7 +108,10 @@ struct ceph_msg_data { }; struct ceph_msg_data_cursor { - struct ceph_msg_data *data; /* data item this describes */ + size_t total_resid; /* across all data items */ + struct list_head *data_head; /* = &ceph_msg->data */ + + struct ceph_msg_data *data; /* current data item */ size_t resid; /* bytes not yet consumed */ bool last_piece; /* current is last piece */ bool need_crc; /* crc update needed */ -- cgit v1.2.3 From 90af36022aecdeeb1b9c0755461187de717c86dd Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 5 Apr 2013 14:46:01 -0500 Subject: libceph: add, don't set data for a message Change the names of the functions that put data on a pagelist to reflect that we're adding to whatever's already there rather than just setting it to the one thing. Currently only one data item is ever added to a message, but that's about to change. This resolves: http://tracker.ceph.com/issues/2770 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/messenger.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index de1d2e1ecce2..7c1420bb1dce 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -275,12 +275,12 @@ extern void ceph_msg_revoke_incoming(struct ceph_msg *msg); extern void ceph_con_keepalive(struct ceph_connection *con); -extern void ceph_msg_data_set_pages(struct ceph_msg *msg, struct page **pages, +extern void ceph_msg_data_add_pages(struct ceph_msg *msg, struct page **pages, size_t length, size_t alignment); -extern void ceph_msg_data_set_pagelist(struct ceph_msg *msg, +extern void ceph_msg_data_add_pagelist(struct ceph_msg *msg, struct ceph_pagelist *pagelist); #ifdef CONFIG_BLOCK -extern void ceph_msg_data_set_bio(struct ceph_msg *msg, struct bio *bio, +extern void ceph_msg_data_add_bio(struct ceph_msg *msg, struct bio *bio, size_t length); #endif /* CONFIG_BLOCK */ -- cgit v1.2.3 From 04017e29bbcf0673d8a6af616c56e395d05f5971 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 5 Apr 2013 14:46:02 -0500 Subject: libceph: make method call data be a separate data item Right now the data for a method call is specified via a pointer and length, and it's copied--along with the class and method name--into a pagelist data item to be sent to the osd. Instead, encode the data in a data item separate from the class and method names. This will allow large amounts of data to be supplied to methods without copying. Only rbd uses the class functionality right now, and when it really needs this it will probably need to use a page array rather than a page list. But this simple implementation demonstrates the functionality on the osd client, and that's enough for now. This resolves: http://tracker.ceph.com/issues/4104 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/osd_client.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 4ec46c0ceaf7..2a68a7465c18 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -92,10 +92,9 @@ struct ceph_osd_req_op { struct { const char *class_name; const char *method_name; - const void *request_data; struct ceph_osd_data request_info; + struct ceph_osd_data request_data; struct ceph_osd_data response_data; - u32 request_data_len; __u8 class_len; __u8 method_len; __u8 argc; @@ -259,6 +258,9 @@ extern void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *, struct bio *bio, size_t bio_length); #endif /* CONFIG_BLOCK */ +extern void osd_req_op_cls_request_data_pagelist(struct ceph_osd_request *, + unsigned int which, + struct ceph_pagelist *pagelist); extern void osd_req_op_cls_response_data_pages(struct ceph_osd_request *, unsigned int which, struct page **pages, u64 length, @@ -267,9 +269,7 @@ extern void osd_req_op_cls_response_data_pages(struct ceph_osd_request *, extern void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, u16 opcode, - const char *class, const char *method, - const void *request_data, - size_t request_data_size); + const char *class, const char *method); extern void osd_req_op_watch_init(struct ceph_osd_request *osd_req, unsigned int which, u16 opcode, u64 cookie, u64 version, int flag); -- cgit v1.2.3 From 26be88087ae8a04a5b576aa2f490597b649fc132 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 15 Apr 2013 11:20:42 -0500 Subject: libceph: change how "safe" callback is used An osd request currently has two callbacks. They inform the initiator of the request when we've received confirmation for the target osd that a request was received, and when the osd indicates all changes described by the request are durable. The only time the second callback is used is in the ceph file system for a synchronous write. There's a race that makes some handling of this case unsafe. This patch addresses this problem. The error handling for this callback is also kind of gross, and this patch changes that as well. In ceph_sync_write(), if a safe callback is requested we want to add the request on the ceph inode's unsafe items list. Because items on this list must have their tid set (by ceph_osd_start_request()), the request added *after* the call to that function returns. The problem with this is that there's a race between starting the request and adding it to the unsafe items list; the request may already be complete before ceph_sync_write() even begins to put it on the list. To address this, we change the way the "safe" callback is used. Rather than just calling it when the request is "safe", we use it to notify the initiator the bounds (start and end) of the period during which the request is *unsafe*. So the initiator gets notified just before the request gets sent to the osd (when it is "unsafe"), and again when it's known the results are durable (it's no longer unsafe). The first call will get made in __send_request(), just before the request message gets sent to the messenger for the first time. That function is only called by __send_queued(), which is always called with the osd client's request mutex held. We then have this callback function insert the request on the ceph inode's unsafe list when we're told the request is unsafe. This will avoid the race because this call will be made under protection of the osd client's request mutex. It also nicely groups the setup and cleanup of the state associated with managing unsafe requests. The name of the "safe" callback field is changed to "unsafe" to better reflect its new purpose. It has a Boolean "unsafe" parameter to indicate whether the request is becoming unsafe or is now safe. Because the "msg" parameter wasn't used, we drop that. This resolves the original problem reportedin: http://tracker.ceph.com/issues/4706 Reported-by: Yan, Zheng Signed-off-by: Alex Elder Reviewed-by: Yan, Zheng Reviewed-by: Sage Weil --- include/linux/ceph/osd_client.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 2a68a7465c18..0d3358ef5285 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -29,6 +29,7 @@ struct ceph_authorizer; */ typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *, struct ceph_msg *); +typedef void (*ceph_osdc_unsafe_callback_t)(struct ceph_osd_request *, bool); /* a given osd we're communicating with */ struct ceph_osd { @@ -149,7 +150,8 @@ struct ceph_osd_request { struct kref r_kref; bool r_mempool; struct completion r_completion, r_safe_completion; - ceph_osdc_callback_t r_callback, r_safe_callback; + ceph_osdc_callback_t r_callback; + ceph_osdc_unsafe_callback_t r_unsafe_callback; struct ceph_eversion r_reassert_version; struct list_head r_unsafe_item; -- cgit v1.2.3 From 406e2c9f9286fc93ae2191a7abf477dea05aadc9 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 15 Apr 2013 14:50:36 -0500 Subject: libceph: kill off osd data write_request parameters In the incremental move toward supporting distinct data items in an osd request some of the functions had "write_request" parameters to indicate, basically, whether the data belonged to in_data or the out_data. Now that we maintain the data fields in the op structure there is no need to indicate the direction, so get rid of the "write_request" parameters. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/osd_client.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 0d3358ef5285..0e406934a551 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -241,22 +241,22 @@ extern void osd_req_op_extent_update(struct ceph_osd_request *osd_req, extern struct ceph_osd_data *osd_req_op_extent_osd_data( struct ceph_osd_request *osd_req, - unsigned int which, bool write_request); + unsigned int which); extern struct ceph_osd_data *osd_req_op_cls_response_data( struct ceph_osd_request *osd_req, unsigned int which); extern void osd_req_op_extent_osd_data_pages(struct ceph_osd_request *, - unsigned int which, bool write_request, + unsigned int which, struct page **pages, u64 length, u32 alignment, bool pages_from_pool, bool own_pages); extern void osd_req_op_extent_osd_data_pagelist(struct ceph_osd_request *, - unsigned int which, bool write_request, + unsigned int which, struct ceph_pagelist *pagelist); #ifdef CONFIG_BLOCK extern void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *, - unsigned int which, bool write_request, + unsigned int which, struct bio *bio, size_t bio_length); #endif /* CONFIG_BLOCK */ -- cgit v1.2.3 From 49719778bfa5371ec9b5a7d989bb29000e3ac5df Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 11 Feb 2013 12:33:24 -0600 Subject: libceph: support raw data requests Allow osd request ops that aren't otherwise structured (not class, extent, or watch ops) to specify "raw" data to be used to hold incoming data for the op. Make use of this capability for the osd STAT op. Prefix the name of the private function osd_req_op_init() with "_", and expose a new function by that (earlier) name whose purpose is to initialize osd ops with (only) implied data. For now we'll just support the use of a page array for an osd op with incoming raw data. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/osd_client.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 0e406934a551..4d84a2b44f18 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -84,6 +84,7 @@ struct ceph_osd_req_op { u16 op; /* CEPH_OSD_OP_* */ u32 payload_len; union { + struct ceph_osd_data raw_data_in; struct { u64 offset, length; u64 truncate_size; @@ -232,6 +233,15 @@ extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc, extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg); +extern void osd_req_op_init(struct ceph_osd_request *osd_req, + unsigned int which, u16 opcode); + +extern void osd_req_op_raw_data_in_pages(struct ceph_osd_request *, + unsigned int which, + struct page **pages, u64 length, + u32 alignment, bool pages_from_pool, + bool own_pages); + extern void osd_req_op_extent_init(struct ceph_osd_request *osd_req, unsigned int which, u16 opcode, u64 offset, u64 length, -- cgit v1.2.3 From 6c57b5545d46e276381a15a59283c984cf3f94e3 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 19 Apr 2013 15:34:49 -0500 Subject: libceph: support pages for class request data Add the ability to provide an array of pages as outbound request data for object class method calls. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/osd_client.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 4d84a2b44f18..4191cd2c55e5 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -273,6 +273,11 @@ extern void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *, extern void osd_req_op_cls_request_data_pagelist(struct ceph_osd_request *, unsigned int which, struct ceph_pagelist *pagelist); +extern void osd_req_op_cls_request_data_pages(struct ceph_osd_request *, + unsigned int which, + struct page **pages, u64 length, + u32 alignment, bool pages_from_pool, + bool own_pages); extern void osd_req_op_cls_response_data_pages(struct ceph_osd_request *, unsigned int which, struct page **pages, u64 length, -- cgit v1.2.3 From b587398a4ff6520753f9a58da294c80ee22443a5 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 19 Apr 2013 15:34:50 -0500 Subject: libceph: add signed type limits Flesh out the limits defined in to include the maximum and minimum values for signed type S8, S16, S32, and S64. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/decode.h | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/decode.h b/include/linux/ceph/decode.h index 689f1df37bff..9575a52e011f 100644 --- a/include/linux/ceph/decode.h +++ b/include/linux/ceph/decode.h @@ -10,10 +10,20 @@ /* This seemed to be the easiest place to define these */ -#define U8_MAX ((u8) (~0U)) -#define U16_MAX ((u16) (~0U)) -#define U32_MAX ((u32) (~0U)) -#define U64_MAX ((u64) (~0ULL)) +#define U8_MAX ((u8)(~0U)) +#define U16_MAX ((u16)(~0U)) +#define U32_MAX ((u32)(~0U)) +#define U64_MAX ((u64)(~0ULL)) + +#define S8_MAX ((s8)(U8_MAX >> 1)) +#define S16_MAX ((s16)(U16_MAX >> 1)) +#define S32_MAX ((s32)(U32_MAX >> 1)) +#define S64_MAX ((s64)(U64_MAX >> 1LL)) + +#define S8_MIN ((s8)(-S8_MAX - 1)) +#define S16_MIN ((s16)(-S16_MAX - 1)) +#define S32_MIN ((s32)(-S32_MAX - 1)) +#define S64_MIN ((s64)(-S64_MAX - 1LL)) /* * in all cases, -- cgit v1.2.3 From c3f56102f28d90946171ae51753bd417b003fd42 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 19 Apr 2013 15:34:50 -0500 Subject: libceph: validate timespec conversions A ceph timespec contains 32-bit unsigned values for its seconds and nanoseconds components. For a standard timespec, both fields are signed, and the seconds field is almost surely 64 bits. Add some explicit casts so the fact that this conversion is taking place is obvious. Also trip a bug if we ever try to put out of range (negative or too big) values into a ceph timespec. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/decode.h | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/decode.h b/include/linux/ceph/decode.h index 9575a52e011f..379f71508995 100644 --- a/include/linux/ceph/decode.h +++ b/include/linux/ceph/decode.h @@ -154,14 +154,19 @@ bad: static inline void ceph_decode_timespec(struct timespec *ts, const struct ceph_timespec *tv) { - ts->tv_sec = le32_to_cpu(tv->tv_sec); - ts->tv_nsec = le32_to_cpu(tv->tv_nsec); + ts->tv_sec = (__kernel_time_t)le32_to_cpu(tv->tv_sec); + ts->tv_nsec = (long)le32_to_cpu(tv->tv_nsec); } static inline void ceph_encode_timespec(struct ceph_timespec *tv, const struct timespec *ts) { - tv->tv_sec = cpu_to_le32(ts->tv_sec); - tv->tv_nsec = cpu_to_le32(ts->tv_nsec); + BUG_ON(ts->tv_sec < 0); + BUG_ON(ts->tv_sec > (__kernel_time_t)U32_MAX); + BUG_ON(ts->tv_nsec < 0); + BUG_ON(ts->tv_nsec > (long)U32_MAX); + + tv->tv_sec = cpu_to_le32((u32)ts->tv_sec); + tv->tv_nsec = cpu_to_le32((u32)ts->tv_nsec); } /* -- cgit v1.2.3 From 4f0dcb10cf1454a1c38aeaa04cb2757535e4905e Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Tue, 30 Apr 2013 00:44:32 -0500 Subject: libceph: create source file "net/ceph/snapshot.c" This creates a new source file "net/ceph/snapshot.c" to contain utility routines related to ceph snapshot contexts. The main motivation was to define ceph_create_snap_context() as a common way to create these structures, but I've moved the definitions of ceph_get_snap_context() and ceph_put_snap_context() there too. (The benefit of inlining those is very small, and I'd rather keep this collection of functions together.) Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/libceph.h | 30 +++++------------------------- 1 file changed, 5 insertions(+), 25 deletions(-) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 5493d7b86423..2e3024881a5e 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -157,31 +157,11 @@ struct ceph_snap_context { u64 snaps[]; }; -static inline struct ceph_snap_context * -ceph_get_snap_context(struct ceph_snap_context *sc) -{ - /* - printk("get_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref), - atomic_read(&sc->nref)+1); - */ - if (sc) - atomic_inc(&sc->nref); - return sc; -} - -static inline void ceph_put_snap_context(struct ceph_snap_context *sc) -{ - if (!sc) - return; - /* - printk("put_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref), - atomic_read(&sc->nref)-1); - */ - if (atomic_dec_and_test(&sc->nref)) { - /*printk(" deleting snap_context %p\n", sc);*/ - kfree(sc); - } -} +extern struct ceph_snap_context *ceph_create_snap_context(u32 snap_count, + gfp_t gfp_flags); +extern struct ceph_snap_context *ceph_get_snap_context( + struct ceph_snap_context *sc); +extern void ceph_put_snap_context(struct ceph_snap_context *sc); /* * calculate the number of pages a given length and offset map onto, -- cgit v1.2.3 From 5522ae0b68421e2645303ff010e27afc5292e0ab Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Wed, 1 May 2013 12:43:04 -0500 Subject: libceph: use slab cache for osd client requests Create a slab cache to manage allocation of ceph_osdc_request structures. This resolves: http://tracker.ceph.com/issues/3926 Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- include/linux/ceph/osd_client.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux/ceph') diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 4191cd2c55e5..186db0bf4951 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -224,6 +224,9 @@ struct ceph_osd_client { struct workqueue_struct *notify_wq; }; +extern int ceph_osdc_setup(void); +extern void ceph_osdc_cleanup(void); + extern int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client); extern void ceph_osdc_stop(struct ceph_osd_client *osdc); -- cgit v1.2.3