From f792685006274a850e6cc0ea9ade275ccdfc90bc Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 5 Mar 2013 18:05:46 +0100 Subject: math64: New div64_u64_rem helper Provide an extended version of div64_u64() that also returns the remainder of the division. We are going to need this to refine the cputime scaling code. Signed-off-by: Frederic Weisbecker Cc: Stanislaw Gruszka Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Andrew Morton --- lib/div64.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) (limited to 'lib') diff --git a/lib/div64.c b/lib/div64.c index a163b6caef73..3af5728d95fd 100644 --- a/lib/div64.c +++ b/lib/div64.c @@ -79,9 +79,10 @@ EXPORT_SYMBOL(div_s64_rem); #endif /** - * div64_u64 - unsigned 64bit divide with 64bit divisor + * div64_u64_rem - unsigned 64bit divide with 64bit divisor and 64bit remainder * @dividend: 64bit dividend * @divisor: 64bit divisor + * @remainder: 64bit remainder * * This implementation is a modified version of the algorithm proposed * by the book 'Hacker's Delight'. The original source and full proof @@ -89,27 +90,33 @@ EXPORT_SYMBOL(div_s64_rem); * * 'http://www.hackersdelight.org/HDcode/newCode/divDouble.c.txt' */ -#ifndef div64_u64 -u64 div64_u64(u64 dividend, u64 divisor) +#ifndef div64_u64_rem +u64 div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder) { u32 high = divisor >> 32; u64 quot; if (high == 0) { - quot = div_u64(dividend, divisor); + u32 rem32; + quot = div_u64_rem(dividend, divisor, &rem32); + *remainder = rem32; } else { int n = 1 + fls(high); quot = div_u64(dividend >> n, divisor >> n); if (quot != 0) quot--; - if ((dividend - quot * divisor) >= divisor) + + *remainder = dividend - quot * divisor; + if (*remainder >= divisor) { quot++; + *remainder -= divisor; + } } return quot; } -EXPORT_SYMBOL(div64_u64); +EXPORT_SYMBOL(div64_u64_rem); #endif /** -- cgit v1.2.3 From cbe5e6109538ddab57764a88d9f0c2accd0c7d48 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Fri, 22 Mar 2013 22:17:36 -0600 Subject: lru_cache: introduce lc_get_cumulative() New helper to be able to consolidate more updates into a single transaction. Without this, we can only grab a single refcount on an updated element while preparing a transaction. lc_get_cumulative - like lc_get; also finds to-be-changed elements @lc: the lru cache to operate on @enr: the label to look up Unlike lc_get this also returns the element for @enr, if it is belonging to a pending transaction, so the return values are like for lc_get(), plus: pointer to an element already on the "to_be_changed" list. In this case, the cache was already marked %LC_DIRTY. Caller needs to make sure that the pending transaction is completed, before proceeding to actually use this element. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Fixed up by Jens to export lc_get_cumulative(). 
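For illustration only (not part of the patch; the helper name and the label range are made up), a caller preparing one cumulative transaction could collect references on every label it intends to update before writing the transaction out:

#include <linux/lru_cache.h>

/* hypothetical helper: grab references for labels [first, last] in one go */
static void collect_refs(struct lru_cache *lc, unsigned int first, unsigned int last)
{
	unsigned int enr;

	for (enr = first; enr <= last; enr++) {
		struct lc_element *e = lc_get_cumulative(lc, enr);

		if (!e)
			continue;	/* no free slot yet; retry after the pending transaction */
		/*
		 * e may still sit on the "to_be_changed" list; the cache is
		 * then marked LC_DIRTY and the pending transaction has to be
		 * completed before e is actually used.
		 */
	}
}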
Signed-off-by: Jens Axboe --- lib/lru_cache.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 46 insertions(+), 10 deletions(-) (limited to 'lib') diff --git a/lib/lru_cache.c b/lib/lru_cache.c index 8335d39d2ccd..4a83ecd03650 100644 --- a/lib/lru_cache.c +++ b/lib/lru_cache.c @@ -365,7 +365,13 @@ static int lc_unused_element_available(struct lru_cache *lc) return 0; } -static struct lc_element *__lc_get(struct lru_cache *lc, unsigned int enr, bool may_change) +/* used as internal flags to __lc_get */ +enum { + LC_GET_MAY_CHANGE = 1, + LC_GET_MAY_USE_UNCOMMITTED = 2, +}; + +static struct lc_element *__lc_get(struct lru_cache *lc, unsigned int enr, unsigned int flags) { struct lc_element *e; @@ -380,22 +386,31 @@ static struct lc_element *__lc_get(struct lru_cache *lc, unsigned int enr, bool * this enr is currently being pulled in already, * and will be available once the pending transaction * has been committed. */ - if (e && e->lc_new_number == e->lc_number) { + if (e) { + if (e->lc_new_number != e->lc_number) { + /* It has been found above, but on the "to_be_changed" + * list, not yet committed. Don't pull it in twice, + * wait for the transaction, then try again... + */ + if (!(flags & LC_GET_MAY_USE_UNCOMMITTED)) + RETURN(NULL); + /* ... unless the caller is aware of the implications, + * probably preparing a cumulative transaction. */ + ++e->refcnt; + ++lc->hits; + RETURN(e); + } + /* else: lc_new_number == lc_number; a real hit. */ ++lc->hits; if (e->refcnt++ == 0) lc->used++; list_move(&e->list, &lc->in_use); /* Not evictable... */ RETURN(e); } + /* e == NULL */ ++lc->misses; - if (!may_change) - RETURN(NULL); - - /* It has been found above, but on the "to_be_changed" list, not yet - * committed. Don't pull it in twice, wait for the transaction, then - * try again */ - if (e) + if (!(flags & LC_GET_MAY_CHANGE)) RETURN(NULL); /* To avoid races with lc_try_lock(), first, mark us dirty @@ -477,7 +492,27 @@ static struct lc_element *__lc_get(struct lru_cache *lc, unsigned int enr, bool */ struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr) { - return __lc_get(lc, enr, 1); + return __lc_get(lc, enr, LC_GET_MAY_CHANGE); +} + +/** + * lc_get_cumulative - like lc_get; also finds to-be-changed elements + * @lc: the lru cache to operate on + * @enr: the label to look up + * + * Unlike lc_get this also returns the element for @enr, if it is belonging to + * a pending transaction, so the return values are like for lc_get(), + * plus: + * + * pointer to an element already on the "to_be_changed" list. + * In this case, the cache was already marked %LC_DIRTY. + * + * Caller needs to make sure that the pending transaction is completed, + * before proceeding to actually use this element. + */ +struct lc_element *lc_get_cumulative(struct lru_cache *lc, unsigned int enr) +{ + return __lc_get(lc, enr, LC_GET_MAY_CHANGE|LC_GET_MAY_USE_UNCOMMITTED); } /** @@ -648,3 +683,4 @@ EXPORT_SYMBOL(lc_seq_printf_stats); EXPORT_SYMBOL(lc_seq_dump_details); EXPORT_SYMBOL(lc_try_lock); EXPORT_SYMBOL(lc_is_used); +EXPORT_SYMBOL(lc_get_cumulative); -- cgit v1.2.3 From 2db76d7c3c6db93058f983c8240f7c7c25e87ee6 Mon Sep 17 00:00:00 2001 From: Imre Deak Date: Tue, 26 Mar 2013 15:14:18 +0200 Subject: lib/scatterlist: sg_page_iter: support sg lists w/o backing pages The i915 driver uses sg lists for memory without backing 'struct page' pages, similarly to other IO memory regions, setting only the DMA address for these. 
It does this, so that it can program the HW MMU tables in a uniform way both for sg lists with and without backing pages. Without a valid page pointer we can't call nth_page to get the current page in __sg_page_iter_next, so add a helper that relevant users can call separately. Also add a helper to get the DMA address of the current page (idea from Daniel). Convert all places in i915 to use the new API. Signed-off-by: Imre Deak Reviewed-by: Damien Lespiau Signed-off-by: Daniel Vetter --- lib/scatterlist.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'lib') diff --git a/lib/scatterlist.c b/lib/scatterlist.c index b83c144d731f..a1cf8cae60e7 100644 --- a/lib/scatterlist.c +++ b/lib/scatterlist.c @@ -401,7 +401,6 @@ void __sg_page_iter_start(struct sg_page_iter *piter, piter->__pg_advance = 0; piter->__nents = nents; - piter->page = NULL; piter->sg = sglist; piter->sg_pgoffset = pgoffset; } @@ -426,7 +425,6 @@ bool __sg_page_iter_next(struct sg_page_iter *piter) if (!--piter->__nents || !piter->sg) return false; } - piter->page = nth_page(sg_page(piter->sg), piter->sg_pgoffset); return true; } @@ -496,7 +494,7 @@ bool sg_miter_next(struct sg_mapping_iter *miter) miter->__remaining = min_t(unsigned long, miter->__remaining, PAGE_SIZE - miter->__offset); } - miter->page = miter->piter.page; + miter->page = sg_page_iter_page(&miter->piter); miter->consumed = miter->length = miter->__remaining; if (miter->__flags & SG_MITER_ATOMIC) -- cgit v1.2.3 From 0ecc833bac594099505a090cbca6ccd5b83d5975 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 29 Mar 2013 12:23:28 -0400 Subject: mode_t, whack-a-mole at 11... Signed-off-by: Al Viro --- lib/notifier-error-inject.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/notifier-error-inject.c b/lib/notifier-error-inject.c index 44b92cb6224f..eb4a04afea80 100644 --- a/lib/notifier-error-inject.c +++ b/lib/notifier-error-inject.c @@ -17,7 +17,7 @@ static int debugfs_errno_get(void *data, u64 *val) DEFINE_SIMPLE_ATTRIBUTE(fops_errno, debugfs_errno_get, debugfs_errno_set, "%lld\n"); -static struct dentry *debugfs_create_errno(const char *name, mode_t mode, +static struct dentry *debugfs_create_errno(const char *name, umode_t mode, struct dentry *parent, int *value) { return debugfs_create_file(name, mode, parent, value, &fops_errno); @@ -50,7 +50,7 @@ struct dentry *notifier_err_inject_init(const char *name, struct dentry *parent, struct notifier_err_inject *err_inject, int priority) { struct notifier_err_inject_action *action; - mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; + umode_t mode = S_IFREG | S_IRUSR | S_IWUSR; struct dentry *dir; struct dentry *actions_dir; -- cgit v1.2.3 From a49b7e82cab0f9b41f483359be83f44fbb6b4979 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 13 Apr 2013 15:15:30 -0700 Subject: kobject: fix kset_find_obj() race with concurrent last kobject_put() Anatol Pomozov identified a race condition that hits module unloading and re-loading. To quote Anatol: "This is a race condition that exists between kset_find_obj() and kobject_put(). kset_find_obj() might return a kobject that has refcount equal to 0 if this kobject is being freed by kobject_put() in another thread. Here is the timeline for the crash in the case where kset_find_obj() searches for an object that nobody holds and another thread is doing kobject_put() on the same kobject: THREAD A (calls kset_find_obj()) THREAD B (calls kobject_put()) spin_lock() atomic_dec_return(kobj->kref), counter gets zero here ...
starts kobject cleanup .... spin_lock() // WAIT thread A in kobj_kset_leave() iterate over kset->list atomic_inc(kobj->kref) (counter becomes 1) spin_unlock() spin_lock() // taken // it does not know that thread A increased counter so it removes obj from the list spin_unlock() vfree(module) // frees module object with containing kobj // kobj points to freed memory area!! kobject_put(kobj) // OOPS!!!! The race above happens because module.c tries to use kset_find_obj() when somebody unloads a module. The module.c code was introduced in commit 6494a93d55fa" Anatol supplied a patch specific to module.c that worked around the problem by simply not using kset_find_obj() at all, but rather than make a local band-aid, this just fixes kset_find_obj() to be thread-safe using the proper model of refusing to get a new reference if the refcount has already dropped to zero. See examples of this proper refcount handling not only in the kref documentation, but in various other equivalent uses of this pattern by grepping for atomic_inc_not_zero(). [ Side note: the module race does indicate that module loading and unloading is not properly serialized wrt sysfs information using the module mutex. That may require further thought, but this is the correct fix at the kobject layer regardless. ] Reported-analyzed-and-tested-by: Anatol Pomozov Cc: Greg Kroah-Hartman Cc: Al Viro Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- lib/kobject.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/kobject.c b/lib/kobject.c index e07ee1fcd6f1..a65486613d79 100644 --- a/lib/kobject.c +++ b/lib/kobject.c @@ -529,6 +529,13 @@ struct kobject *kobject_get(struct kobject *kobj) return kobj; } +static struct kobject *kobject_get_unless_zero(struct kobject *kobj) +{ + if (!kref_get_unless_zero(&kobj->kref)) + kobj = NULL; + return kobj; +} + /* * kobject_cleanup - free kobject resources. * @kobj: object to cleanup @@ -751,7 +758,7 @@ struct kobject *kset_find_obj(struct kset *kset, const char *name) list_for_each_entry(k, &kset->list, entry) { if (kobject_name(k) && !strcmp(kobject_name(k), name)) { - ret = kobject_get(k); + ret = kobject_get_unless_zero(k); break; } } -- cgit v1.2.3 From 0635eb8a54cf0fea64b174bb68bc36b9c3d622db Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Mon, 15 Apr 2013 13:09:45 -0700 Subject: Move utf16 functions to kernel core and rename We want to be able to use the utf16 functions that are currently present in the EFI variables code in platform-specific code as well. Move them to the kernel core, and in the process rename them to accurately describe what they do - they don't handle UTF16, only UCS2. Signed-off-by: Matthew Garrett Signed-off-by: Matt Fleming --- lib/Kconfig | 3 +++ lib/Makefile | 2 ++ lib/ucs2_string.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 56 insertions(+) create mode 100644 lib/ucs2_string.c (limited to 'lib') diff --git a/lib/Kconfig b/lib/Kconfig index 3958dc4389f9..fe01d418b09a 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -404,4 +404,7 @@ config OID_REGISTRY help Enable fast lookup object identifier registry.
+config UCS2_STRING + tristate + endmenu diff --git a/lib/Makefile b/lib/Makefile index d7946ff75b2e..6e2cc561f761 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -174,3 +174,5 @@ quiet_cmd_build_OID_registry = GEN $@ cmd_build_OID_registry = perl $(srctree)/$(src)/build_OID_registry $< $@ clean-files += oid_registry_data.c + +obj-$(CONFIG_UCS2_STRING) += ucs2_string.o diff --git a/lib/ucs2_string.c b/lib/ucs2_string.c new file mode 100644 index 000000000000..6f500ef2301d --- /dev/null +++ b/lib/ucs2_string.c @@ -0,0 +1,51 @@ +#include +#include + +/* Return the number of unicode characters in data */ +unsigned long +ucs2_strnlen(const ucs2_char_t *s, size_t maxlength) +{ + unsigned long length = 0; + + while (*s++ != 0 && length < maxlength) + length++; + return length; +} +EXPORT_SYMBOL(ucs2_strnlen); + +unsigned long +ucs2_strlen(const ucs2_char_t *s) +{ + return ucs2_strnlen(s, ~0UL); +} +EXPORT_SYMBOL(ucs2_strlen); + +/* + * Return the number of bytes is the length of this string + * Note: this is NOT the same as the number of unicode characters + */ +unsigned long +ucs2_strsize(const ucs2_char_t *data, unsigned long maxlength) +{ + return ucs2_strnlen(data, maxlength/sizeof(ucs2_char_t)) * sizeof(ucs2_char_t); +} +EXPORT_SYMBOL(ucs2_strsize); + +int +ucs2_strncmp(const ucs2_char_t *a, const ucs2_char_t *b, size_t len) +{ + while (1) { + if (len == 0) + return 0; + if (*a < *b) + return -1; + if (*a > *b) + return 1; + if (*a == 0) /* implies *b == 0 */ + return 0; + a++; + b++; + len--; + } +} +EXPORT_SYMBOL(ucs2_strncmp); -- cgit v1.2.3 From c729de8fcea37a1c444e81857eace12494c804a9 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 15 Apr 2013 22:23:45 -0700 Subject: x86, kdump: Set crashkernel_low automatically Chao said that kdump does not work well on his system on 3.8 without an extra parameter: even though iommu does not work with kdump, he now has to append crashkernel_low=Y in the first kernel to make kdump work. We have now modified crashkernel=X to allocate memory beyond 4G (if available) and do not allocate the low range for the crash kernel if the user does not specify that with crashkernel_low=Y. This causes a regression if the iommu is not enabled. Without an iommu, swiotlb needs to be set up in the first 4G and there is no low memory available to the second kernel. Set crashkernel_low automatically if the user does not specify it. For systems that do support IOMMU with kdump properly, the user can specify crashkernel_low=0 to save that 72M of low RAM. -v3: add swiotlb_size() according to Konrad. -v4: add comments what 8M is for according to hpa. also update more crashkernel_low= in kernel-parameters.txt -v5: update changelog according to Vivek. -v6: Change description about swiotlb referring according to HATAYAMA. Reported-by: WANG Chao Tested-by: WANG Chao Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1366089828-19692-2-git-send-email-yinghai@kernel.org Acked-by: Vivek Goyal Signed-off-by: H. Peter Anvin --- lib/swiotlb.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'lib') diff --git a/lib/swiotlb.c b/lib/swiotlb.c index bfe02b8fc55b..d23762e6652c 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -105,9 +105,9 @@ setup_io_tlb_npages(char *str) if (!strcmp(str, "force")) swiotlb_force = 1; - return 1; + return 0; } -__setup("swiotlb=", setup_io_tlb_npages); +early_param("swiotlb", setup_io_tlb_npages); /* make io_tlb_overflow tunable too?
*/ unsigned long swiotlb_nr_tbl(void) @@ -115,6 +115,18 @@ unsigned long swiotlb_nr_tbl(void) return io_tlb_nslabs; } EXPORT_SYMBOL_GPL(swiotlb_nr_tbl); + +/* default to 64MB */ +#define IO_TLB_DEFAULT_SIZE (64UL<<20) +unsigned long swiotlb_size_or_default(void) +{ + unsigned long size; + + size = io_tlb_nslabs << IO_TLB_SHIFT; + + return size ? size : (IO_TLB_DEFAULT_SIZE); +} + /* Note that this doesn't work with highmem page */ static dma_addr_t swiotlb_virt_to_bus(struct device *hwdev, volatile void *address) @@ -188,8 +200,7 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) void __init swiotlb_init(int verbose) { - /* default to 64MB */ - size_t default_size = 64UL<<20; + size_t default_size = IO_TLB_DEFAULT_SIZE; unsigned char *vstart; unsigned long bytes; -- cgit v1.2.3 From 4b59e6c4730978679b414a8da61514a2518da512 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 29 Apr 2013 15:06:11 -0700 Subject: mm, show_mem: suppress page counts in non-blockable contexts On large systems with a lot of memory, walking all RAM to determine page types may take a half second or even more. In non-blockable contexts, the page allocator will emit a page allocation failure warning unless __GFP_NOWARN is specified. In such contexts, irqs are typically disabled and such a lengthy delay may even result in NMI watchdog timeouts. To fix this, suppress the page walk in such contexts when printing the page allocation failure warning. Signed-off-by: David Rientjes Cc: Mel Gorman Acked-by: Michal Hocko Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/show_mem.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'lib') diff --git a/lib/show_mem.c b/lib/show_mem.c index 4407f8c9b1f7..b7c72311ad0c 100644 --- a/lib/show_mem.c +++ b/lib/show_mem.c @@ -18,6 +18,9 @@ void show_mem(unsigned int filter) printk("Mem-Info:\n"); show_free_areas(filter); + if (filter & SHOW_MEM_FILTER_PAGE_COUNT) + return; + for_each_online_pgdat(pgdat) { unsigned long i, flags; -- cgit v1.2.3 From 9375db07adeaeea5f5ea7ca0463a8b371d71ddbb Mon Sep 17 00:00:00 2001 From: Philipp Zabel Date: Mon, 29 Apr 2013 16:17:10 -0700 Subject: genalloc: add devres support, allow to find a managed pool by device This patch adds three exported functions to lib/genalloc.c: devm_gen_pool_create, dev_get_gen_pool, and of_get_named_gen_pool. devm_gen_pool_create is a managed version of gen_pool_create that keeps track of the pool via devres and allows the management code to automatically destroy it after device removal. dev_get_gen_pool retrieves the gen_pool for a given device, if it was created with devm_gen_pool_create, using devres_find. of_get_named_gen_pool retrieves the gen_pool for a given device node and property name, where the property must contain a phandle pointing to a platform device node. The corresponding platform device is then fed into dev_get_gen_pool and the resulting gen_pool is returned. 
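For illustration (not part of the patch): a platform driver could create a managed pool in probe() and let unrelated code find it later via the device. The SRAM base, the pool parameters and the probe function below are placeholders, not taken from any real driver.

#include <linux/genalloc.h>
#include <linux/platform_device.h>
#include <linux/sizes.h>

static int example_probe(struct platform_device *pdev)
{
	struct gen_pool *pool;
	unsigned long sram_virt = 0;	/* placeholder: ioremapped SRAM base */

	/* 2^5 = 32 byte granularity; the pool is destroyed on device removal */
	pool = devm_gen_pool_create(&pdev->dev, 5, dev_to_node(&pdev->dev));
	if (!pool)
		return -ENOMEM;

	return gen_pool_add(pool, sram_virt, SZ_64K, dev_to_node(&pdev->dev));
}

Code that only holds the struct device can then call gen_pool_alloc(dev_get_gen_pool(dev), size), and a device-tree consumer can reach the same pool through of_get_named_gen_pool().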
[akpm@linux-foundation.org: make the of_get_named_gen_pool() stub static, fixing a zillion link errors] [akpm@linux-foundation.org: squish "struct device declared inside parameter list" warning] Signed-off-by: Philipp Zabel Acked-by: Grant Likely Tested-by: Michal Simek Cc: Fabio Estevam Cc: Matt Porter Cc: Dong Aisheng Cc: Greg Kroah-Hartman Cc: Rob Herring Cc: Paul Gortmaker Cc: Javier Martin Cc: Huang Shijie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/genalloc.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 81 insertions(+) (limited to 'lib') diff --git a/lib/genalloc.c b/lib/genalloc.c index 54920433705a..b35cfa9bc3d4 100644 --- a/lib/genalloc.c +++ b/lib/genalloc.c @@ -34,6 +34,8 @@ #include #include #include +#include +#include static int set_bits_ll(unsigned long *addr, unsigned long mask_to_set) { @@ -480,3 +482,82 @@ unsigned long gen_pool_best_fit(unsigned long *map, unsigned long size, return start_bit; } EXPORT_SYMBOL(gen_pool_best_fit); + +static void devm_gen_pool_release(struct device *dev, void *res) +{ + gen_pool_destroy(*(struct gen_pool **)res); +} + +/** + * devm_gen_pool_create - managed gen_pool_create + * @dev: device that provides the gen_pool + * @min_alloc_order: log base 2 of number of bytes each bitmap bit represents + * @nid: node id of the node the pool structure should be allocated on, or -1 + * + * Create a new special memory pool that can be used to manage special purpose + * memory not managed by the regular kmalloc/kfree interface. The pool will be + * automatically destroyed by the device management code. + */ +struct gen_pool *devm_gen_pool_create(struct device *dev, int min_alloc_order, + int nid) +{ + struct gen_pool **ptr, *pool; + + ptr = devres_alloc(devm_gen_pool_release, sizeof(*ptr), GFP_KERNEL); + + pool = gen_pool_create(min_alloc_order, nid); + if (pool) { + *ptr = pool; + devres_add(dev, ptr); + } else { + devres_free(ptr); + } + + return pool; +} + +/** + * dev_get_gen_pool - Obtain the gen_pool (if any) for a device + * @dev: device to retrieve the gen_pool from + * @name: Optional name for the gen_pool, usually NULL + * + * Returns the gen_pool for the device if one is present, or NULL. + */ +struct gen_pool *dev_get_gen_pool(struct device *dev) +{ + struct gen_pool **p = devres_find(dev, devm_gen_pool_release, NULL, + NULL); + + if (!p) + return NULL; + return *p; +} +EXPORT_SYMBOL_GPL(dev_get_gen_pool); + +#ifdef CONFIG_OF +/** + * of_get_named_gen_pool - find a pool by phandle property + * @np: device node + * @propname: property name containing phandle(s) + * @index: index into the phandle array + * + * Returns the pool that contains the chunk starting at the physical + * address of the device tree node pointed at by the phandle property, + * or NULL if not found. 
+ */ +struct gen_pool *of_get_named_gen_pool(struct device_node *np, + const char *propname, int index) +{ + struct platform_device *pdev; + struct device_node *np_pool; + + np_pool = of_parse_phandle(np, propname, index); + if (!np_pool) + return NULL; + pdev = of_find_device_by_node(np_pool); + if (!pdev) + return NULL; + return dev_get_gen_pool(&pdev->dev); +} +EXPORT_SYMBOL_GPL(of_get_named_gen_pool); +#endif /* CONFIG_OF */ -- cgit v1.2.3 From 30493cc9dddb68066dcc4878015660fdaa8e0965 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 29 Apr 2013 16:18:09 -0700 Subject: lib/int_sqrt.c: optimize square root algorithm Optimize the current version of the shift-and-subtract (hardware) algorithm, described by John von Newmann[1] and Guy L Steele. Iterating 1,000,000 times, perf shows for the current version: Performance counter stats for './sqrt-curr' (10 runs): 27.170996 task-clock # 0.979 CPUs utilized ( +- 3.19% ) 3 context-switches # 0.103 K/sec ( +- 4.76% ) 0 cpu-migrations # 0.004 K/sec ( +-100.00% ) 104 page-faults # 0.004 M/sec ( +- 0.16% ) 64,921,199 cycles # 2.389 GHz ( +- 0.03% ) 28,967,789 stalled-cycles-frontend # 44.62% frontend cycles idle ( +- 0.18% ) stalled-cycles-backend 104,502,623 instructions # 1.61 insns per cycle # 0.28 stalled cycles per insn ( +- 0.00% ) 34,088,368 branches # 1254.587 M/sec ( +- 0.00% ) 4,901 branch-misses # 0.01% of all branches ( +- 1.32% ) 0.027763015 seconds time elapsed ( +- 3.22% ) And for the new version: Performance counter stats for './sqrt-new' (10 runs): 0.496869 task-clock # 0.519 CPUs utilized ( +- 2.38% ) 0 context-switches # 0.000 K/sec 0 cpu-migrations # 0.403 K/sec ( +-100.00% ) 104 page-faults # 0.209 M/sec ( +- 0.15% ) 590,760 cycles # 1.189 GHz ( +- 2.35% ) 395,053 stalled-cycles-frontend # 66.87% frontend cycles idle ( +- 3.67% ) stalled-cycles-backend 398,963 instructions # 0.68 insns per cycle # 0.99 stalled cycles per insn ( +- 0.39% ) 70,228 branches # 141.341 M/sec ( +- 0.36% ) 3,364 branch-misses # 4.79% of all branches ( +- 5.45% ) 0.000957440 seconds time elapsed ( +- 2.42% ) Furthermore, this saves space in instruction text: text data bss dec hex filename 111 0 0 111 6f lib/int_sqrt-baseline.o 89 0 0 89 59 lib/int_sqrt.o [1] http://en.wikipedia.org/wiki/First_Draft_of_a_Report_on_the_EDVAC Signed-off-by: Davidlohr Bueso Reviewed-by: Jonathan Gonzalez Tested-by: Jonathan Gonzalez Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/int_sqrt.c | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) (limited to 'lib') diff --git a/lib/int_sqrt.c b/lib/int_sqrt.c index fc2eeb7cb2ea..1ef4cc344977 100644 --- a/lib/int_sqrt.c +++ b/lib/int_sqrt.c @@ -1,3 +1,9 @@ +/* + * Copyright (C) 2013 Davidlohr Bueso + * + * Based on the shift-and-subtract algorithm for computing integer + * square root from Guy L. Steele. 
+ */ #include #include @@ -10,23 +16,23 @@ */ unsigned long int_sqrt(unsigned long x) { - unsigned long op, res, one; + unsigned long b, m, y = 0; - op = x; - res = 0; + if (x <= 1) + return x; - one = 1UL << (BITS_PER_LONG - 2); - while (one > op) - one >>= 2; + m = 1UL << (BITS_PER_LONG - 2); + while (m != 0) { + b = y + m; + y >>= 1; - while (one != 0) { - if (op >= res + one) { - op = op - (res + one); - res = res + 2 * one; + if (x >= b) { + x -= b; + y += m; } - res /= 2; - one /= 4; + m >>= 2; } - return res; + + return y; } EXPORT_SYMBOL(int_sqrt); -- cgit v1.2.3 From 095d141b2e40665289d44a42d387ffac2841ef82 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 29 Apr 2013 16:18:10 -0700 Subject: argv_split(): teach it to handle mutable strings argv_split() allocates argv[count_argc(str)] array and assumes that it will find the same number of arguments later. This is obviously wrong if this string can be changed, say, by sysctl. With this patch argv_split() kstrndup's the whole string and does not split it, we simply replace the spaces with zeroes and keep the allocated memory in argv[-1] for argv_free(arg). We do not use argv[0] because: - str can be all-spaces or empty. In fact this case is fine, we could kfree() it before return, but: - str can have a space at the start, and we can not rely on kstrndup(skip_spaces(str)) because it can equally race if this string is mutable. Also, simplify count_argc() and kill the no longer used skip_arg(). Signed-off-by: Oleg Nesterov Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/argv_split.c | 87 ++++++++++++++++++++++++++------------------------------ 1 file changed, 40 insertions(+), 47 deletions(-) (limited to 'lib') diff --git a/lib/argv_split.c b/lib/argv_split.c index 1e9a6cbc3689..e927ed0e18a8 100644 --- a/lib/argv_split.c +++ b/lib/argv_split.c @@ -8,23 +8,17 @@ #include #include -static const char *skip_arg(const char *cp) -{ - while (*cp && !isspace(*cp)) - cp++; - - return cp; -} - static int count_argc(const char *str) { int count = 0; + bool was_space; - while (*str) { - str = skip_spaces(str); - if (*str) { + for (was_space = true; *str; str++) { + if (isspace(*str)) { + was_space = true; + } else if (was_space) { + was_space = false; count++; - str = skip_arg(str); } } @@ -39,10 +33,8 @@ static int count_argc(const char *str) */ void argv_free(char **argv) { - char **p; - for (p = argv; *p; p++) - kfree(*p); - + argv--; + kfree(argv[0]); kfree(argv); } EXPORT_SYMBOL(argv_free); @@ -59,43 +51,44 @@ EXPORT_SYMBOL(argv_free); * considered to be a single argument separator. The returned array * is always NULL-terminated. Returns NULL on memory allocation * failure. + * + * The source string at `str' may be undergoing concurrent alteration via + * userspace sysctl activity (at least). The argv_split() implementation + * attempts to handle this gracefully by taking a local copy to work on. 
*/ char **argv_split(gfp_t gfp, const char *str, int *argcp) { - int argc = count_argc(str); - char **argv = kzalloc(sizeof(*argv) * (argc+1), gfp); - char **argvp; - - if (argv == NULL) - goto out; - - if (argcp) - *argcp = argc; - - argvp = argv; - - while (*str) { - str = skip_spaces(str); - - if (*str) { - const char *p = str; - char *t; - - str = skip_arg(str); + char *argv_str; + bool was_space; + char **argv, **argv_ret; + int argc; + + argv_str = kstrndup(str, KMALLOC_MAX_SIZE - 1, gfp); + if (!argv_str) + return NULL; + + argc = count_argc(argv_str); + argv = kmalloc(sizeof(*argv) * (argc + 2), gfp); + if (!argv) { + kfree(argv_str); + return NULL; + } - t = kstrndup(p, str-p, gfp); - if (t == NULL) - goto fail; - *argvp++ = t; + *argv = argv_str; + argv_ret = ++argv; + for (was_space = true; *argv_str; argv_str++) { + if (isspace(*argv_str)) { + was_space = true; + *argv_str = 0; + } else if (was_space) { + was_space = false; + *argv++ = argv_str; } } - *argvp = NULL; - - out: - return argv; + *argv = NULL; - fail: - argv_free(argv); - return NULL; + if (argcp) + *argcp = argc; + return argv_ret; } EXPORT_SYMBOL(argv_split); -- cgit v1.2.3 From 2e0fb404c86d6c86dc33e284310eb5d28d192dcf Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 29 Apr 2013 16:18:11 -0700 Subject: lib, net: make isodigit() public and use it There are at least two users of isodigit(). Let's make it a public function of ctype.h. Signed-off-by: Andy Shevchenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/dynamic_debug.c | 1 - 1 file changed, 1 deletion(-) (limited to 'lib') diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c index 5276b99ca650..46032453abd5 100644 --- a/lib/dynamic_debug.c +++ b/lib/dynamic_debug.c @@ -281,7 +281,6 @@ static inline int parse_lineno(const char *str, unsigned int *val) * allow the user to express a query which matches a format * containing embedded spaces. */ -#define isodigit(c) ((c) >= '0' && (c) <= '7') static char *unescape(char *str) { char *in = str; -- cgit v1.2.3 From 3e6628c4b347a558965041290c5a92791dd4c741 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 29 Apr 2013 16:21:16 -0700 Subject: idr: introduce idr_alloc_cyclic() As Tejun points out, there are several users of the IDR facility that attempt to use it in a cyclic fashion. These users are likely to see -ENOSPC errors after the counter wraps one or more times however. This patchset adds a new idr_alloc_cyclic routine and converts several of these users to it. Many of these users are in obscure parts of the kernel, and I don't have a good way to test some of them. The change is pretty straightforward though, so hopefully it won't be an issue. There is one other cyclic user of idr_alloc that I didn't touch in ipc/util.c. That one is doing some strange stuff that I didn't quite understand, but it looks like it should probably be converted later somehow. This patch: Thus spake Tejun Heo: Ooh, BTW, the cyclic allocation is broken. It's prone to -ENOSPC after the first wraparound. There are several cyclic users in the kernel and I think it probably would be best to implement cyclic support in idr. This patch does that by adding new idr_alloc_cyclic function that such users in the kernel can use. With this, there's no need for a caller to keep track of the last value used as that's now tracked internally. This should prevent the ENOSPC problems that can hit when the "last allocated" counter exceeds INT_MAX. Later patches will convert existing cyclic users to the new interface. 
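A minimal usage sketch (not part of the patch; session_idr and the omitted locking are assumptions): a caller that wants monotonically increasing ids no longer needs its own "last id" counter.

#include <linux/idr.h>
#include <linux/gfp.h>

static DEFINE_IDR(session_idr);

static int new_session_id(void *session)
{
	int id;

	idr_preload(GFP_KERNEL);
	/* ids grow monotonically; on wraparound the lowest free id >= 1 is handed out again */
	id = idr_alloc_cyclic(&session_idr, session, 1, 0, GFP_NOWAIT);
	idr_preload_end();

	return id;	/* new id on success, negative errno on failure */
}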
Signed-off-by: Jeff Layton Reviewed-by: Tejun Heo Cc: "David S. Miller" Cc: "J. Bruce Fields" Cc: Eric Paris Cc: Jack Morgenstein Cc: John McCutchan Cc: Neil Horman Cc: Or Gerlitz Cc: Robert Love Cc: Roland Dreier Cc: Sridhar Samudrala Cc: Steve Wise Cc: Tom Tucker Cc: Vlad Yasevich Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/idr.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'lib') diff --git a/lib/idr.c b/lib/idr.c index 322e2816f2fb..cca4b9302a71 100644 --- a/lib/idr.c +++ b/lib/idr.c @@ -495,6 +495,33 @@ int idr_alloc(struct idr *idr, void *ptr, int start, int end, gfp_t gfp_mask) } EXPORT_SYMBOL_GPL(idr_alloc); +/** + * idr_alloc_cyclic - allocate new idr entry in a cyclical fashion + * @idr: the (initialized) idr + * @ptr: pointer to be associated with the new id + * @start: the minimum id (inclusive) + * @end: the maximum id (exclusive, <= 0 for max) + * @gfp_mask: memory allocation flags + * + * Essentially the same as idr_alloc, but prefers to allocate progressively + * higher ids if it can. If the "cur" counter wraps, then it will start again + * at the "start" end of the range and allocate one that has already been used. + */ +int idr_alloc_cyclic(struct idr *idr, void *ptr, int start, int end, + gfp_t gfp_mask) +{ + int id; + + id = idr_alloc(idr, ptr, max(start, idr->cur), end, gfp_mask); + if (id == -ENOSPC) + id = idr_alloc(idr, ptr, start, end, gfp_mask); + + if (likely(id >= 0)) + idr->cur = id + 1; + return id; +} +EXPORT_SYMBOL(idr_alloc_cyclic); + static void idr_remove_warning(int id) { printk(KERN_WARNING -- cgit v1.2.3 From cedddb00023b1321f2bb2739739aa8c9927fc063 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Mon, 29 Apr 2013 16:21:25 -0700 Subject: uuid: use prandom_bytes() Use prandom_bytes() to generate 16 bytes of pseudo-random bytes. Signed-off-by: Akinobu Mita Cc: "Theodore Ts'o" Cc: Huang Ying Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/uuid.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'lib') diff --git a/lib/uuid.c b/lib/uuid.c index 52a6fe6387de..398821e4dce1 100644 --- a/lib/uuid.c +++ b/lib/uuid.c @@ -25,13 +25,7 @@ static void __uuid_gen_common(__u8 b[16]) { - int i; - u32 r; - - for (i = 0; i < 4; i++) { - r = random32(); - memcpy(b + i * 4, &r, 4); - } + prandom_bytes(b, 16); /* reversion 0b10 */ b[8] = (b[8] & 0x3F) | 0x80; } -- cgit v1.2.3 From f39fee5f11f7adfcf5c9643f87718b80450be18a Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Mon, 29 Apr 2013 16:21:28 -0700 Subject: lib/: rename random32() to prandom_u32() Use preferable function name which implies using a pseudo-random number generator. 
Signed-off-by: Akinobu Mita Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/fault-inject.c | 2 +- lib/list_sort.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/fault-inject.c b/lib/fault-inject.c index f7210ad6cffd..c5c7a762b850 100644 --- a/lib/fault-inject.c +++ b/lib/fault-inject.c @@ -122,7 +122,7 @@ bool should_fail(struct fault_attr *attr, ssize_t size) return false; } - if (attr->probability <= random32() % 100) + if (attr->probability <= prandom_u32() % 100) return false; if (!fail_stacktrace(attr)) diff --git a/lib/list_sort.c b/lib/list_sort.c index d7325c6b103f..1183fa70a44d 100644 --- a/lib/list_sort.c +++ b/lib/list_sort.c @@ -229,7 +229,7 @@ static int __init list_sort_test(void) goto exit; } /* force some equivalencies */ - el->value = random32() % (TEST_LIST_LEN/3); + el->value = prandom_u32() % (TEST_LIST_LEN / 3); el->serial = i; el->poison1 = TEST_POISON1; el->poison2 = TEST_POISON2; -- cgit v1.2.3 From f3002134158092178be81339ec5a22ff80e6c308 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Tue, 30 Apr 2013 11:35:07 +0200 Subject: Revert "math64: New div64_u64_rem helper" This reverts commit f792685006274a850e6cc0ea9ade275ccdfc90bc. The cputime scaling code was changed/fixed and does not need the div64_u64_rem() primitive anymore. It has no other users, so let's remove them. Signed-off-by: Stanislaw Gruszka Cc: Frederic Weisbecker Cc: rostedt@goodmis.org Cc: Linus Torvalds Cc: Dave Hansen Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1367314507-9728-4-git-send-email-sgruszka@redhat.com Signed-off-by: Ingo Molnar --- lib/div64.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) (limited to 'lib') diff --git a/lib/div64.c b/lib/div64.c index 3af5728d95fd..a163b6caef73 100644 --- a/lib/div64.c +++ b/lib/div64.c @@ -79,10 +79,9 @@ EXPORT_SYMBOL(div_s64_rem); #endif /** - * div64_u64_rem - unsigned 64bit divide with 64bit divisor and 64bit remainder + * div64_u64 - unsigned 64bit divide with 64bit divisor * @dividend: 64bit dividend * @divisor: 64bit divisor - * @remainder: 64bit remainder * * This implementation is a modified version of the algorithm proposed * by the book 'Hacker's Delight'. The original source and full proof @@ -90,33 +89,27 @@ EXPORT_SYMBOL(div_s64_rem); * * 'http://www.hackersdelight.org/HDcode/newCode/divDouble.c.txt' */ -#ifndef div64_u64_rem -u64 div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder) +#ifndef div64_u64 +u64 div64_u64(u64 dividend, u64 divisor) { u32 high = divisor >> 32; u64 quot; if (high == 0) { - u32 rem32; - quot = div_u64_rem(dividend, divisor, &rem32); - *remainder = rem32; + quot = div_u64(dividend, divisor); } else { int n = 1 + fls(high); quot = div_u64(dividend >> n, divisor >> n); if (quot != 0) quot--; - - *remainder = dividend - quot * divisor; - if (*remainder >= divisor) { + if ((dividend - quot * divisor) >= divisor) quot++; - *remainder -= divisor; - } } return quot; } -EXPORT_SYMBOL(div64_u64_rem); +EXPORT_SYMBOL(div64_u64); #endif /** -- cgit v1.2.3 From b0d33c2bd77bcf2d7c9427d2361ac57fe5b33aa1 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 12 Dec 2012 10:18:50 -0800 Subject: vsprintf: Add extension %pSR - print_symbol replacement print_symbol takes a long and converts it to a function name and offset. %pS does something similar, but doesn't translate the address via __builtin_extract_return_addr. %pSR does the translation. 
This will enable replacing multiple calls like printk(...); printk_symbol(addr); printk("\n"); with a single non-interleavable in dmesg printk("... %pSR\n", (void *)addr); Update documentation too. Signed-off-by: Joe Perches Signed-off-by: Jiri Kosina --- lib/vsprintf.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'lib') diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 0d62fd700f68..e149c6416384 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -534,14 +534,21 @@ char *string(char *buf, char *end, const char *s, struct printf_spec spec) static noinline_for_stack char *symbol_string(char *buf, char *end, void *ptr, - struct printf_spec spec, char ext) + struct printf_spec spec, const char *fmt) { - unsigned long value = (unsigned long) ptr; + unsigned long value; #ifdef CONFIG_KALLSYMS char sym[KSYM_SYMBOL_LEN]; - if (ext == 'B') +#endif + + if (fmt[1] == 'R') + ptr = __builtin_extract_return_addr(ptr); + value = (unsigned long)ptr; + +#ifdef CONFIG_KALLSYMS + if (*fmt == 'B') sprint_backtrace(sym, value); - else if (ext != 'f' && ext != 's') + else if (*fmt != 'f' && *fmt != 's') sprint_symbol(sym, value); else sprint_symbol_no_offset(sym, value); @@ -987,6 +994,7 @@ int kptr_restrict __read_mostly; * - 'f' For simple symbolic function names without offset * - 'S' For symbolic direct pointers with offset * - 's' For symbolic direct pointers without offset + * - '[FfSs]R' as above with __builtin_extract_return_addr() translation * - 'B' For backtraced symbolic direct pointers with offset * - 'R' For decoded struct resource, e.g., [mem 0x0-0x1f 64bit pref] * - 'r' For raw struct resource, e.g., [mem 0x0-0x1f flags 0x201] @@ -1060,7 +1068,7 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr, case 'S': case 's': case 'B': - return symbol_string(buf, end, ptr, spec, *fmt); + return symbol_string(buf, end, ptr, spec, fmt); case 'R': case 'r': return resource_string(buf, end, ptr, spec, fmt); -- cgit v1.2.3 From 196779b9b4ce1922afabdc20d0270720603bd46c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 30 Apr 2013 15:27:12 -0700 Subject: dump_stack: consolidate dump_stack() implementations and unify their behaviors Both dump_stack() and show_stack() are currently implemented by each architecture. show_stack(NULL, NULL) dumps the backtrace for the current task as does dump_stack(). On some archs, dump_stack() prints extra information - pid, utsname and so on - in addition to the backtrace while the two are identical on other archs. The usages in arch-independent code of the two functions indicate show_stack(NULL, NULL) should print out bare backtrace while dump_stack() is used for debugging purposes when something went wrong, so it does make sense to print additional information on the task which triggered dump_stack(). There's no reason to require archs to implement two separate but mostly identical functions. It leads to unnecessary subtle information. This patch expands the dummy fallback dump_stack() implementation in lib/dump_stack.c such that it prints out debug information (taken from x86) and invokes show_stack(NULL, NULL) and drops arch-specific dump_stack() implementations in all archs except blackfin. Blackfin's dump_stack() does something wonky that I don't understand. Debug information can be printed separately by calling dump_stack_print_info() so that arch-specific dump_stack() implementation can still emit the same debug information. This is used in blackfin. This patch brings the following behavior changes. 
* On some archs, an extra level in backtrace for show_stack() could be printed. This is because the top frame was determined in dump_stack() on those archs while generic dump_stack() can't do that reliably. It can be compensated by inlining dump_stack() but not sure whether that'd be necessary. * Most archs didn't use to print debug info on dump_stack(). They do now. An example WARN dump follows. WARNING: at kernel/workqueue.c:4841 init_workqueues+0x35/0x505() Hardware name: empty Modules linked in: CPU: 0 PID: 1 Comm: swapper/0 Not tainted 3.9.0-rc1-work+ #9 0000000000000009 ffff88007c861e08 ffffffff81c614dc ffff88007c861e48 ffffffff8108f50f ffffffff82228240 0000000000000040 ffffffff8234a03c 0000000000000000 0000000000000000 0000000000000000 ffff88007c861e58 Call Trace: [] dump_stack+0x19/0x1b [] warn_slowpath_common+0x7f/0xc0 [] warn_slowpath_null+0x1a/0x20 [] init_workqueues+0x35/0x505 ... v2: CPU number added to the generic debug info as requested by s390 folks and dropped the s390 specific dump_stack(). This loses %ksp from the debug message which the maintainers think isn't important enough to keep the s390-specific dump_stack() implementation. dump_stack_print_info() is moved to kernel/printk.c from lib/dump_stack.c. Because linkage is per objecct file, dump_stack_print_info() living in the same lib file as generic dump_stack() means that archs which implement custom dump_stack() - at this point, only blackfin - can't use dump_stack_print_info() as that will bring in the generic version of dump_stack() too. v1 The v1 patch broke build on blackfin due to this issue. The build breakage was reported by Fengguang Wu. Signed-off-by: Tejun Heo Acked-by: David S. Miller Acked-by: Vineet Gupta Acked-by: Jesper Nilsson Acked-by: Vineet Gupta Acked-by: Martin Schwidefsky [s390 bits] Cc: Heiko Carstens Cc: Mike Frysinger Cc: Fengguang Wu Cc: Bjorn Helgaas Cc: Sam Ravnborg Acked-by: Richard Kuo [hexagon bits] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/dump_stack.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'lib') diff --git a/lib/dump_stack.c b/lib/dump_stack.c index 42f4f55c9458..53bad099ebd6 100644 --- a/lib/dump_stack.c +++ b/lib/dump_stack.c @@ -5,11 +5,16 @@ #include #include +#include +/** + * dump_stack - dump the current task information and its stack trace + * + * Architectures can override this implementation by implementing its own. + */ void dump_stack(void) { - printk(KERN_NOTICE - "This architecture does not implement dump_stack()\n"); + dump_stack_print_info(KERN_DEFAULT); + show_stack(NULL, NULL); } - EXPORT_SYMBOL(dump_stack); -- cgit v1.2.3 From 16c7fa05829e8b91db48e3539c5d6ff3c2b18a23 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 30 Apr 2013 15:27:30 -0700 Subject: lib/string_helpers: introduce generic string_unescape There are several places in kernel where modules unescapes input to convert C-Style Escape Sequences into byte codes. The patch provides generic implementation of such approach. Test cases are also included into the patch. 
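A small usage sketch (not part of the patch): expanding C-style escapes in place in a buffer that arrived as escaped text.

#include <linux/string_helpers.h>

static void unescape_example(void)
{
	char buf[] = "foo\\tbar\\n";	/* holds literal backslash sequences */
	int len;

	/* expand \t, \n, octal, hex and special escapes in one pass */
	len = string_unescape_any_inplace(buf);
	/* buf now holds "foo\tbar\n" and len == 8 */
}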
[akpm@linux-foundation.org: clarify comment] [akpm@linux-foundation.org: export get_random_int() to modules] Signed-off-by: Andy Shevchenko Cc: Samuel Thibault Cc: Greg Kroah-Hartman Cc: Jason Baron Cc: Alexander Viro Cc: William Hubbs Cc: Chris Brannon Cc: Kirk Reiser Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig.debug | 3 ++ lib/Makefile | 4 +- lib/string_helpers.c | 133 ++++++++++++++++++++++++++++++++++++++++++++++ lib/test-string_helpers.c | 103 +++++++++++++++++++++++++++++++++++ 4 files changed, 242 insertions(+), 1 deletion(-) create mode 100644 lib/test-string_helpers.c (limited to 'lib') diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 28be08c09bab..77ebaa3dfa12 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1463,5 +1463,8 @@ source "lib/Kconfig.kgdb" source "lib/Kconfig.kmemcheck" +config TEST_STRING_HELPERS + tristate "Test functions located in the string_helpers module at runtime" + config TEST_KSTRTOX tristate "Test kstrto*() family of functions at runtime" diff --git a/lib/Makefile b/lib/Makefile index 6e2cc561f761..23c9a0fe74fc 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -22,8 +22,10 @@ lib-y += kobject.o klist.o obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \ bust_spinlocks.o hexdump.o kasprintf.o bitmap.o scatterlist.o \ - string_helpers.o gcd.o lcm.o list_sort.o uuid.o flex_array.o \ + gcd.o lcm.o list_sort.o uuid.o flex_array.o \ bsearch.o find_last_bit.o find_next_bit.o llist.o memweight.o kfifo.o +obj-y += string_helpers.o +obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o obj-y += kstrtox.o obj-$(CONFIG_TEST_KSTRTOX) += test-kstrtox.o diff --git a/lib/string_helpers.c b/lib/string_helpers.c index 1cffc223bff5..ed5c1454dd62 100644 --- a/lib/string_helpers.c +++ b/lib/string_helpers.c @@ -2,10 +2,12 @@ * Helpers for formatting and printing strings * * Copyright 31 August 2008 James Bottomley + * Copyright (C) 2013, Intel Corporation */ #include #include #include +#include #include /** @@ -66,3 +68,134 @@ int string_get_size(u64 size, const enum string_size_units units, return 0; } EXPORT_SYMBOL(string_get_size); + +static bool unescape_space(char **src, char **dst) +{ + char *p = *dst, *q = *src; + + switch (*q) { + case 'n': + *p = '\n'; + break; + case 'r': + *p = '\r'; + break; + case 't': + *p = '\t'; + break; + case 'v': + *p = '\v'; + break; + case 'f': + *p = '\f'; + break; + default: + return false; + } + *dst += 1; + *src += 1; + return true; +} + +static bool unescape_octal(char **src, char **dst) +{ + char *p = *dst, *q = *src; + u8 num; + + if (isodigit(*q) == 0) + return false; + + num = (*q++) & 7; + while (num < 32 && isodigit(*q) && (q - *src < 3)) { + num <<= 3; + num += (*q++) & 7; + } + *p = num; + *dst += 1; + *src = q; + return true; +} + +static bool unescape_hex(char **src, char **dst) +{ + char *p = *dst, *q = *src; + int digit; + u8 num; + + if (*q++ != 'x') + return false; + + num = digit = hex_to_bin(*q++); + if (digit < 0) + return false; + + digit = hex_to_bin(*q); + if (digit >= 0) { + q++; + num = (num << 4) | digit; + } + *p = num; + *dst += 1; + *src = q; + return true; +} + +static bool unescape_special(char **src, char **dst) +{ + char *p = *dst, *q = *src; + + switch (*q) { + case '\"': + *p = '\"'; + break; + case '\\': + *p = '\\'; + break; + case 'a': + *p = '\a'; + break; + case 'e': + *p = '\e'; + break; + default: + return false; + } + *dst += 1; + *src += 1; + return true; +} + +int string_unescape(char *src, char *dst, size_t size, 
unsigned int flags) +{ + char *out = dst; + + while (*src && --size) { + if (src[0] == '\\' && src[1] != '\0' && size > 1) { + src++; + size--; + + if (flags & UNESCAPE_SPACE && + unescape_space(&src, &out)) + continue; + + if (flags & UNESCAPE_OCTAL && + unescape_octal(&src, &out)) + continue; + + if (flags & UNESCAPE_HEX && + unescape_hex(&src, &out)) + continue; + + if (flags & UNESCAPE_SPECIAL && + unescape_special(&src, &out)) + continue; + + *out++ = '\\'; + } + *out++ = *src++; + } + *out = '\0'; + + return out - dst; +} +EXPORT_SYMBOL(string_unescape); diff --git a/lib/test-string_helpers.c b/lib/test-string_helpers.c new file mode 100644 index 000000000000..6ac48de04c0e --- /dev/null +++ b/lib/test-string_helpers.c @@ -0,0 +1,103 @@ +/* + * Test cases for lib/string_helpers.c module. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include + +struct test_string { + const char *in; + const char *out; + unsigned int flags; +}; + +static const struct test_string strings[] __initconst = { + { + .in = "\\f\\ \\n\\r\\t\\v", + .out = "\f\\ \n\r\t\v", + .flags = UNESCAPE_SPACE, + }, + { + .in = "\\40\\1\\387\\0064\\05\\040\\8a\\110\\777", + .out = " \001\00387\0064\005 \\8aH?7", + .flags = UNESCAPE_OCTAL, + }, + { + .in = "\\xv\\xa\\x2c\\xD\\x6f2", + .out = "\\xv\n,\ro2", + .flags = UNESCAPE_HEX, + }, + { + .in = "\\h\\\\\\\"\\a\\e\\", + .out = "\\h\\\"\a\e\\", + .flags = UNESCAPE_SPECIAL, + }, +}; + +static void __init test_string_unescape(unsigned int flags, bool inplace) +{ + char in[256]; + char out_test[256]; + char out_real[256]; + int i, p = 0, q_test = 0, q_real = sizeof(out_real); + + for (i = 0; i < ARRAY_SIZE(strings); i++) { + const char *s = strings[i].in; + int len = strlen(strings[i].in); + + /* Copy string to in buffer */ + memcpy(&in[p], s, len); + p += len; + + /* Copy expected result for given flags */ + if (flags & strings[i].flags) { + s = strings[i].out; + len = strlen(strings[i].out); + } + memcpy(&out_test[q_test], s, len); + q_test += len; + } + in[p++] = '\0'; + + /* Call string_unescape and compare result */ + if (inplace) { + memcpy(out_real, in, p); + if (flags == UNESCAPE_ANY) + q_real = string_unescape_any_inplace(out_real); + else + q_real = string_unescape_inplace(out_real, flags); + } else if (flags == UNESCAPE_ANY) { + q_real = string_unescape_any(in, out_real, q_real); + } else { + q_real = string_unescape(in, out_real, q_real, flags); + } + + if (q_real != q_test || memcmp(out_test, out_real, q_test)) { + pr_warn("Test failed: flags = %u\n", flags); + print_hex_dump(KERN_WARNING, "Input: ", + DUMP_PREFIX_NONE, 16, 1, in, p - 1, true); + print_hex_dump(KERN_WARNING, "Expected: ", + DUMP_PREFIX_NONE, 16, 1, out_test, q_test, true); + print_hex_dump(KERN_WARNING, "Got: ", + DUMP_PREFIX_NONE, 16, 1, out_real, q_real, true); + } +} + +static int __init test_string_helpers_init(void) +{ + unsigned int i; + + pr_info("Running tests...\n"); + for (i = 0; i < UNESCAPE_ANY + 1; i++) + test_string_unescape(i, false); + test_string_unescape(get_random_int() % (UNESCAPE_ANY + 1), true); + + return -EINVAL; +} +module_init(test_string_helpers_init); +MODULE_LICENSE("Dual BSD/GPL"); -- cgit v1.2.3 From d338b1379f96b20e63aa65082e19e9a18a93b608 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 30 Apr 2013 15:27:32 -0700 Subject: dynamic_debug: reuse generic string_unescape function There is kernel function to do the job in generic way. Let's use it. 
Signed-off-by: Andy Shevchenko Cc: Jason Baron Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/dynamic_debug.c | 48 +++++------------------------------------------- 1 file changed, 5 insertions(+), 43 deletions(-) (limited to 'lib') diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c index 46032453abd5..99fec3ae405a 100644 --- a/lib/dynamic_debug.c +++ b/lib/dynamic_debug.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -276,47 +277,6 @@ static inline int parse_lineno(const char *str, unsigned int *val) return 0; } -/* - * Undo octal escaping in a string, inplace. This is useful to - * allow the user to express a query which matches a format - * containing embedded spaces. - */ -static char *unescape(char *str) -{ - char *in = str; - char *out = str; - - while (*in) { - if (*in == '\\') { - if (in[1] == '\\') { - *out++ = '\\'; - in += 2; - continue; - } else if (in[1] == 't') { - *out++ = '\t'; - in += 2; - continue; - } else if (in[1] == 'n') { - *out++ = '\n'; - in += 2; - continue; - } else if (isodigit(in[1]) && - isodigit(in[2]) && - isodigit(in[3])) { - *out++ = (((in[1] - '0') << 6) | - ((in[2] - '0') << 3) | - (in[3] - '0')); - in += 4; - continue; - } - } - *out++ = *in++; - } - *out = '\0'; - - return str; -} - static int check_set(const char **dest, char *src, char *name) { int rc = 0; @@ -370,8 +330,10 @@ static int ddebug_parse_query(char *words[], int nwords, } else if (!strcmp(words[i], "module")) { rc = check_set(&query->module, words[i+1], "module"); } else if (!strcmp(words[i], "format")) { - rc = check_set(&query->format, unescape(words[i+1]), - "format"); + string_unescape_inplace(words[i+1], UNESCAPE_SPACE | + UNESCAPE_OCTAL | + UNESCAPE_SPECIAL); + rc = check_set(&query->format, words[i+1], "format"); } else if (!strcmp(words[i], "line")) { char *first = words[i+1]; char *last = strchr(first, '-'); -- cgit v1.2.3 From 4130f0efbfe5adb360328b777afa8e45f7e467f7 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 30 Apr 2013 15:28:24 -0700 Subject: rbtree_test: add extra rbtree integrity check Account for the rbtree having 2**bh(v)-1 internal nodes. While this can be seen as a consequence of other checks, Michel states that it nicely sums up what the other properties are for. 
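For reference, the bound the new check relies on is the standard red-black lemma: a node v with black height bh(v) roots a subtree with at least 2**bh(v) - 1 internal nodes, since each child has black height at least bh(v) - 1 and, by induction, n(v) >= 1 + 2 * (2**(bh(v)-1) - 1) = 2**bh(v) - 1. Applied to the root, that is exactly what the added WARN_ON_ONCE() asserts.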
Signed-off-by: Davidlohr Bueso Reviewed-by: Michel Lespinasse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/rbtree_test.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/rbtree_test.c b/lib/rbtree_test.c index af38aedbd874..99515038ff6d 100644 --- a/lib/rbtree_test.c +++ b/lib/rbtree_test.c @@ -117,8 +117,7 @@ static int black_path_count(struct rb_node *rb) static void check(int nr_nodes) { struct rb_node *rb; - int count = 0; - int blacks = 0; + int count = 0, blacks = 0; u32 prev_key = 0; for (rb = rb_first(&root); rb; rb = rb_next(rb)) { @@ -134,7 +133,9 @@ static void check(int nr_nodes) prev_key = node->key; count++; } + WARN_ON_ONCE(count != nr_nodes); + WARN_ON_ONCE(count < (1 << black_path_count(rb_last(&root))) - 1); } static void check_augmented(int nr_nodes) -- cgit v1.2.3 From c75aaa8ed03eb1312ed2990f1a716b2b9cc0df42 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 30 Apr 2013 15:28:25 -0700 Subject: rbtree_test: add __init/__exit annotations Signed-off-by: Davidlohr Bueso Reviewed-by: Michel Lespinasse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/rbtree_test.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/rbtree_test.c b/lib/rbtree_test.c index 99515038ff6d..122f02f9941b 100644 --- a/lib/rbtree_test.c +++ b/lib/rbtree_test.c @@ -149,7 +149,7 @@ static void check_augmented(int nr_nodes) } } -static int rbtree_test_init(void) +static int __init rbtree_test_init(void) { int i, j; cycles_t time1, time2, time; @@ -222,7 +222,7 @@ static int rbtree_test_init(void) return -EAGAIN; /* Fail will directly unload the module */ } -static void rbtree_test_exit(void) +static void __exit rbtree_test_exit(void) { printk(KERN_ALERT "test exit\n"); } -- cgit v1.2.3 From 446f24d1199e8a546ba7c97da3fbb9a505a94795 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Tue, 30 Apr 2013 15:28:42 -0700 Subject: Kconfig: consolidate CONFIG_DEBUG_STRICT_USER_COPY_CHECKS The help text for this config is duplicated across the x86, parisc, and s390 Kconfig.debug files. Arnd Bergman noted that the help text was slightly misleading and should be fixed to state that enabling this option isn't a problem when using pre 4.4 gcc. To simplify the rewording, consolidate the text into lib/Kconfig.debug and modify it there to be more explicit about when you should say N to this config. Also, make the text a bit more generic by stating that this option enables compile time checks so we can cover architectures which emit warnings vs. ones which emit errors. The details of how an architecture decided to implement the checks isn't as important as the concept of compile time checking of copy_from_user() calls. While we're doing this, remove all the copy_from_user_overflow() code that's duplicated many times and place it into lib/ so that any architecture supporting this option can get the function for free. Signed-off-by: Stephen Boyd Acked-by: Arnd Bergmann Acked-by: Ingo Molnar Acked-by: H. 
Peter Anvin Cc: Arjan van de Ven Acked-by: Helge Deller Cc: Heiko Carstens Cc: Stephen Rothwell Cc: Chris Metcalf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig.debug | 18 ++++++++++++++++++ lib/Makefile | 1 + lib/usercopy.c | 9 +++++++++ 3 files changed, 28 insertions(+) create mode 100644 lib/usercopy.c (limited to 'lib') diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 77ebaa3dfa12..770a422a42e8 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1292,6 +1292,24 @@ config LATENCYTOP Enable this option if you want to use the LatencyTOP tool to find out which userspace is blocking on what kernel operations. +config ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS + bool + +config DEBUG_STRICT_USER_COPY_CHECKS + bool "Strict user copy size checks" + depends on ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS + depends on DEBUG_KERNEL && !TRACE_BRANCH_PROFILING + help + Enabling this option turns a certain set of sanity checks for user + copy operations into compile time failures. + + The copy_from_user() etc checks are there to help test if there + are sufficient security checks on the length argument of + the copy operation, by having gcc prove that the argument is + within bounds. + + If unsure, say N. + source mm/Kconfig.debug source kernel/trace/Kconfig diff --git a/lib/Makefile b/lib/Makefile index 23c9a0fe74fc..e9c52e1b853a 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -15,6 +15,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \ is_single_threaded.o plist.o decompress.o kobject_uevent.o \ earlycpio.o +obj-$(CONFIG_ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS) += usercopy.o lib-$(CONFIG_MMU) += ioremap.o lib-$(CONFIG_SMP) += cpumask.o diff --git a/lib/usercopy.c b/lib/usercopy.c new file mode 100644 index 000000000000..4f5b1ddbcd25 --- /dev/null +++ b/lib/usercopy.c @@ -0,0 +1,9 @@ +#include +#include +#include + +void copy_from_user_overflow(void) +{ + WARN(1, "Buffer overflow detected!\n"); +} +EXPORT_SYMBOL(copy_from_user_overflow); -- cgit v1.2.3 From e12a95f40ac78f223deab743a5228f9eb3df8ed1 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Tue, 30 Apr 2013 15:28:49 -0700 Subject: notifier-error-inject: fix module names in Kconfig The Kconfig help text for MEMORY_NOTIFIER_ERROR_INJECT and OF_RECONFIG_NOTIFIER_ERROR_INJECT has mismatched module names. Signed-off-by: Akinobu Mita Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig.debug | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 770a422a42e8..566cf2bc08ea 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1192,7 +1192,7 @@ config MEMORY_NOTIFIER_ERROR_INJECT bash: echo: write error: Cannot allocate memory To compile this code as a module, choose M here: the module will - be called pSeries-reconfig-notifier-error-inject. + be called memory-notifier-error-inject. If unsure, say N. @@ -1209,7 +1209,7 @@ config OF_RECONFIG_NOTIFIER_ERROR_INJECT notified, write the error code to "actions//error". To compile this code as a module, choose M here: the module will - be called memory-notifier-error-inject. + be called of-reconfig-notifier-error-inject. If unsure, say N. 
-- cgit v1.2.3 From 6f9982bdde99d4e6f248613ca780c63de26ba086 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 30 Apr 2013 15:28:50 -0700 Subject: lib/decompress.c: fix initconst Signed-off-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/decompress.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/decompress.c b/lib/decompress.c index 31a804277282..f8fdedaf7b3d 100644 --- a/lib/decompress.c +++ b/lib/decompress.c @@ -38,7 +38,7 @@ struct compress_format { decompress_fn decompressor; }; -static const struct compress_format compressed_formats[] __initdata = { +static const struct compress_format compressed_formats[] __initconst = { { {037, 0213}, "gzip", gunzip }, { {037, 0236}, "gzip", gunzip }, { {0x42, 0x5a}, "bzip2", bunzip2 }, -- cgit v1.2.3 From 9e6879460c8edb0cd3c24c09b83d06541b5af0dc Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 4 May 2013 08:48:27 +0100 Subject: Give the OID registry file module info to avoid kernel tainting Give the OID registry file module information so that it doesn't taint the kernel when compiled as a module and loaded. Reported-by: Dros Adamson Signed-off-by: David Howells cc: Trond Myklebust cc: stable@vger.kernel.org cc: linux-nfs@vger.kernel.org Signed-off-by: Linus Torvalds --- lib/oid_registry.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'lib') diff --git a/lib/oid_registry.c b/lib/oid_registry.c index d8de11f45908..318f382a010d 100644 --- a/lib/oid_registry.c +++ b/lib/oid_registry.c @@ -9,6 +9,7 @@ * 2 of the Licence, or (at your option) any later version. */ +#include #include #include #include @@ -16,6 +17,10 @@ #include #include "oid_registry_data.c" +MODULE_DESCRIPTION("OID Registry"); +MODULE_AUTHOR("Red Hat, Inc."); +MODULE_LICENSE("GPL"); + /** * look_up_OID - Find an OID registration for the specified data * @data: Binary representation of the OID -- cgit v1.2.3 From e2d57f782c8a906f80d5b5f54da803c1638929d7 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Tue, 7 May 2013 06:45:49 -0700 Subject: rwsem: make the waiter type an enumeration rather than a bitmask We are not planning to add some new waiter flags, so we can convert the waiter type into an enumeration. Background: David Howells suggested I do this back when I tried adding a new waiter type for unfair readers. However, I believe the cleanup applies regardless of that use case. 
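The gain is mainly one of expressiveness: the two waiter states are mutually exclusive, and an enum states that directly while letting the compiler warn about unhandled cases in a switch. A minimal, self-contained userspace sketch of the before/after shape (the names are illustrative only, not the kernel's):

#include <stdio.h>

/* Before: mutually exclusive states were encoded as bit flags, e.g.
 *     unsigned int flags;   with #define WAITING_FOR_READ 0x1 etc.
 * After: an enum makes the exclusivity explicit. */
enum waiter_type {
        WAITER_FOR_WRITE,
        WAITER_FOR_READ,
};

struct waiter {
        enum waiter_type type;
};

static const char *describe(const struct waiter *w)
{
        switch (w->type) {              /* compiler can flag a missing case */
        case WAITER_FOR_WRITE: return "writer";
        case WAITER_FOR_READ:  return "reader";
        }
        return "unknown";
}

int main(void)
{
        struct waiter w = { .type = WAITER_FOR_READ };
        printf("waiter is a %s\n", describe(&w));
        return 0;
}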
Signed-off-by: Michel Lespinasse Reviewed-by: Rik van Riel Reviewed-by: Peter Hurley Acked-by: Davidlohr Bueso Signed-off-by: Linus Torvalds --- lib/rwsem-spinlock.c | 19 +++++++++++-------- lib/rwsem.c | 23 +++++++++++++---------- 2 files changed, 24 insertions(+), 18 deletions(-) (limited to 'lib') diff --git a/lib/rwsem-spinlock.c b/lib/rwsem-spinlock.c index 7542afbb22b3..5f117f37ac0a 100644 --- a/lib/rwsem-spinlock.c +++ b/lib/rwsem-spinlock.c @@ -9,12 +9,15 @@ #include #include +enum rwsem_waiter_type { + RWSEM_WAITING_FOR_WRITE, + RWSEM_WAITING_FOR_READ +}; + struct rwsem_waiter { struct list_head list; struct task_struct *task; - unsigned int flags; -#define RWSEM_WAITING_FOR_READ 0x00000001 -#define RWSEM_WAITING_FOR_WRITE 0x00000002 + enum rwsem_waiter_type type; }; int rwsem_is_locked(struct rw_semaphore *sem) @@ -68,7 +71,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wakewrite) waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); if (!wakewrite) { - if (waiter->flags & RWSEM_WAITING_FOR_WRITE) + if (waiter->type == RWSEM_WAITING_FOR_WRITE) goto out; goto dont_wake_writers; } @@ -78,7 +81,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wakewrite) * to -1 here to indicate we get the lock. Instead, we wake it up * to let it go get it again. */ - if (waiter->flags & RWSEM_WAITING_FOR_WRITE) { + if (waiter->type == RWSEM_WAITING_FOR_WRITE) { wake_up_process(waiter->task); goto out; } @@ -86,7 +89,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wakewrite) /* grant an infinite number of read locks to the front of the queue */ dont_wake_writers: woken = 0; - while (waiter->flags & RWSEM_WAITING_FOR_READ) { + while (waiter->type == RWSEM_WAITING_FOR_READ) { struct list_head *next = waiter->list.next; list_del(&waiter->list); @@ -144,7 +147,7 @@ void __sched __down_read(struct rw_semaphore *sem) /* set up my own style of waitqueue */ waiter.task = tsk; - waiter.flags = RWSEM_WAITING_FOR_READ; + waiter.type = RWSEM_WAITING_FOR_READ; get_task_struct(tsk); list_add_tail(&waiter.list, &sem->wait_list); @@ -201,7 +204,7 @@ void __sched __down_write_nested(struct rw_semaphore *sem, int subclass) /* set up my own style of waitqueue */ tsk = current; waiter.task = tsk; - waiter.flags = RWSEM_WAITING_FOR_WRITE; + waiter.type = RWSEM_WAITING_FOR_WRITE; list_add_tail(&waiter.list, &sem->wait_list); /* wait for someone to release the lock */ diff --git a/lib/rwsem.c b/lib/rwsem.c index ad5e0df16ab4..672eb33218ac 100644 --- a/lib/rwsem.c +++ b/lib/rwsem.c @@ -30,12 +30,15 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name, EXPORT_SYMBOL(__init_rwsem); +enum rwsem_waiter_type { + RWSEM_WAITING_FOR_WRITE, + RWSEM_WAITING_FOR_READ +}; + struct rwsem_waiter { struct list_head list; struct task_struct *task; - unsigned int flags; -#define RWSEM_WAITING_FOR_READ 0x00000001 -#define RWSEM_WAITING_FOR_WRITE 0x00000002 + enum rwsem_waiter_type type; }; /* Wake types for __rwsem_do_wake(). 
Note that RWSEM_WAKE_NO_ACTIVE and @@ -65,7 +68,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wake_type) signed long woken, loop, adjustment; waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); - if (!(waiter->flags & RWSEM_WAITING_FOR_WRITE)) + if (waiter->type != RWSEM_WAITING_FOR_WRITE) goto readers_only; if (wake_type == RWSEM_WAKE_READ_OWNED) @@ -112,10 +115,10 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wake_type) waiter = list_entry(waiter->list.next, struct rwsem_waiter, list); - } while (waiter->flags & RWSEM_WAITING_FOR_READ); + } while (waiter->type != RWSEM_WAITING_FOR_WRITE); adjustment = woken * RWSEM_ACTIVE_READ_BIAS; - if (waiter->flags & RWSEM_WAITING_FOR_READ) + if (waiter->type != RWSEM_WAITING_FOR_WRITE) /* hit end of list above */ adjustment -= RWSEM_WAITING_BIAS; @@ -148,7 +151,7 @@ static int try_get_writer_sem(struct rw_semaphore *sem, /* only steal when first waiter is writing */ fwaiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); - if (!(fwaiter->flags & RWSEM_WAITING_FOR_WRITE)) + if (fwaiter->type != RWSEM_WAITING_FOR_WRITE) return 0; adjustment = RWSEM_ACTIVE_WRITE_BIAS; @@ -179,7 +182,7 @@ try_again_write: */ static struct rw_semaphore __sched * rwsem_down_failed_common(struct rw_semaphore *sem, - unsigned int flags, signed long adjustment) + enum rwsem_waiter_type type, signed long adjustment) { struct rwsem_waiter waiter; struct task_struct *tsk = current; @@ -190,7 +193,7 @@ rwsem_down_failed_common(struct rw_semaphore *sem, /* set up my own style of waitqueue */ raw_spin_lock_irq(&sem->wait_lock); waiter.task = tsk; - waiter.flags = flags; + waiter.type = type; get_task_struct(tsk); if (list_empty(&sem->wait_list)) @@ -221,7 +224,7 @@ rwsem_down_failed_common(struct rw_semaphore *sem, raw_spin_lock_irq(&sem->wait_lock); /* Try to get the writer sem, may steal from the head writer: */ - if (flags == RWSEM_WAITING_FOR_WRITE) + if (type == RWSEM_WAITING_FOR_WRITE) if (try_get_writer_sem(sem, &waiter)) { raw_spin_unlock_irq(&sem->wait_lock); return sem; -- cgit v1.2.3 From f7dd1cee9a4e2b1450e4a3732636dfbf28562ee4 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Tue, 7 May 2013 06:45:50 -0700 Subject: rwsem: shorter spinlocked section in rwsem_down_failed_common() This change reduces the size of the spinlocked and TASK_UNINTERRUPTIBLE sections in rwsem_down_failed_common(): - We only need the sem->wait_lock to insert ourselves on the wait_list; the waiter node can be prepared outside of the wait_lock. - The task state only needs to be set to TASK_UNINTERRUPTIBLE immediately before checking if we actually need to sleep; it doesn't need to protect the entire function. 
Signed-off-by: Michel Lespinasse Reviewed-by: Rik van Riel Reviewed-by: Peter Hurley Acked-by: Davidlohr Bueso Signed-off-by: Linus Torvalds --- lib/rwsem.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'lib') diff --git a/lib/rwsem.c b/lib/rwsem.c index 672eb33218ac..40636454cf3c 100644 --- a/lib/rwsem.c +++ b/lib/rwsem.c @@ -188,14 +188,12 @@ rwsem_down_failed_common(struct rw_semaphore *sem, struct task_struct *tsk = current; signed long count; - set_task_state(tsk, TASK_UNINTERRUPTIBLE); - /* set up my own style of waitqueue */ - raw_spin_lock_irq(&sem->wait_lock); waiter.task = tsk; waiter.type = type; get_task_struct(tsk); + raw_spin_lock_irq(&sem->wait_lock); if (list_empty(&sem->wait_list)) adjustment += RWSEM_WAITING_BIAS; list_add_tail(&waiter.list, &sem->wait_list); @@ -218,7 +216,8 @@ rwsem_down_failed_common(struct rw_semaphore *sem, raw_spin_unlock_irq(&sem->wait_lock); /* wait to be given the lock */ - for (;;) { + while (true) { + set_task_state(tsk, TASK_UNINTERRUPTIBLE); if (!waiter.task) break; @@ -231,7 +230,6 @@ rwsem_down_failed_common(struct rw_semaphore *sem, } raw_spin_unlock_irq(&sem->wait_lock); schedule(); - set_task_state(tsk, TASK_UNINTERRUPTIBLE); } tsk->state = TASK_RUNNING; -- cgit v1.2.3 From 1e78277ccbbb48af32a618d1ef0e8534e0b648d7 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Tue, 7 May 2013 06:45:51 -0700 Subject: rwsem: move rwsem_down_failed_common code into rwsem_down_{read,write}_failed Remove the rwsem_down_failed_common function and replace it with two identical copies of its code in rwsem_down_{read,write}_failed. This is because we want to make different optimizations in rwsem_down_{read,write}_failed; we are adding this pure-duplication step as a separate commit in order to make it easier to check the following steps. 
Signed-off-by: Michel Lespinasse Reviewed-by: Rik van Riel Reviewed-by: Peter Hurley Acked-by: Davidlohr Bueso Signed-off-by: Linus Torvalds --- lib/rwsem.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 57 insertions(+), 15 deletions(-) (limited to 'lib') diff --git a/lib/rwsem.c b/lib/rwsem.c index 40636454cf3c..fb658af1c12c 100644 --- a/lib/rwsem.c +++ b/lib/rwsem.c @@ -178,12 +178,12 @@ try_again_write: } /* - * wait for a lock to be granted + * wait for the read lock to be granted */ -static struct rw_semaphore __sched * -rwsem_down_failed_common(struct rw_semaphore *sem, - enum rwsem_waiter_type type, signed long adjustment) +struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) { + enum rwsem_waiter_type type = RWSEM_WAITING_FOR_READ; + signed long adjustment = -RWSEM_ACTIVE_READ_BIAS; struct rwsem_waiter waiter; struct task_struct *tsk = current; signed long count; @@ -237,22 +237,64 @@ rwsem_down_failed_common(struct rw_semaphore *sem, return sem; } -/* - * wait for the read lock to be granted - */ -struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) -{ - return rwsem_down_failed_common(sem, RWSEM_WAITING_FOR_READ, - -RWSEM_ACTIVE_READ_BIAS); -} - /* * wait for the write lock to be granted */ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) { - return rwsem_down_failed_common(sem, RWSEM_WAITING_FOR_WRITE, - -RWSEM_ACTIVE_WRITE_BIAS); + enum rwsem_waiter_type type = RWSEM_WAITING_FOR_WRITE; + signed long adjustment = -RWSEM_ACTIVE_WRITE_BIAS; + struct rwsem_waiter waiter; + struct task_struct *tsk = current; + signed long count; + + /* set up my own style of waitqueue */ + waiter.task = tsk; + waiter.type = type; + get_task_struct(tsk); + + raw_spin_lock_irq(&sem->wait_lock); + if (list_empty(&sem->wait_list)) + adjustment += RWSEM_WAITING_BIAS; + list_add_tail(&waiter.list, &sem->wait_list); + + /* we're now waiting on the lock, but no longer actively locking */ + count = rwsem_atomic_update(adjustment, sem); + + /* If there are no active locks, wake the front queued process(es) up. + * + * Alternatively, if we're called from a failed down_write(), there + * were already threads queued before us and there are no active + * writers, the lock must be read owned; so we try to wake any read + * locks that were queued ahead of us. 
*/ + if (count == RWSEM_WAITING_BIAS) + sem = __rwsem_do_wake(sem, RWSEM_WAKE_NO_ACTIVE); + else if (count > RWSEM_WAITING_BIAS && + adjustment == -RWSEM_ACTIVE_WRITE_BIAS) + sem = __rwsem_do_wake(sem, RWSEM_WAKE_READ_OWNED); + + raw_spin_unlock_irq(&sem->wait_lock); + + /* wait to be given the lock */ + while (true) { + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + if (!waiter.task) + break; + + raw_spin_lock_irq(&sem->wait_lock); + /* Try to get the writer sem, may steal from the head writer: */ + if (type == RWSEM_WAITING_FOR_WRITE) + if (try_get_writer_sem(sem, &waiter)) { + raw_spin_unlock_irq(&sem->wait_lock); + return sem; + } + raw_spin_unlock_irq(&sem->wait_lock); + schedule(); + } + + tsk->state = TASK_RUNNING; + + return sem; } /* -- cgit v1.2.3 From da16922cc031b9c0221c836994276ab193b31de8 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Tue, 7 May 2013 06:45:52 -0700 Subject: rwsem: simplify rwsem_down_read_failed When trying to acquire a read lock, the RWSEM_ACTIVE_READ_BIAS adjustment doesn't cause other readers to block, so we never have to worry about waking them back after canceling this adjustment in rwsem_down_read_failed(). We also never want to steal the lock in rwsem_down_read_failed(), so we don't have to grab the wait_lock either. Signed-off-by: Michel Lespinasse Reviewed-by: Rik van Riel Reviewed-by: Peter Hurley Acked-by: Davidlohr Bueso Signed-off-by: Linus Torvalds --- lib/rwsem.c | 22 ++-------------------- 1 file changed, 2 insertions(+), 20 deletions(-) (limited to 'lib') diff --git a/lib/rwsem.c b/lib/rwsem.c index fb658af1c12c..66f307e90761 100644 --- a/lib/rwsem.c +++ b/lib/rwsem.c @@ -182,7 +182,6 @@ try_again_write: */ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) { - enum rwsem_waiter_type type = RWSEM_WAITING_FOR_READ; signed long adjustment = -RWSEM_ACTIVE_READ_BIAS; struct rwsem_waiter waiter; struct task_struct *tsk = current; @@ -190,7 +189,7 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) /* set up my own style of waitqueue */ waiter.task = tsk; - waiter.type = type; + waiter.type = RWSEM_WAITING_FOR_READ; get_task_struct(tsk); raw_spin_lock_irq(&sem->wait_lock); @@ -201,17 +200,9 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) /* we're now waiting on the lock, but no longer actively locking */ count = rwsem_atomic_update(adjustment, sem); - /* If there are no active locks, wake the front queued process(es) up. - * - * Alternatively, if we're called from a failed down_write(), there - * were already threads queued before us and there are no active - * writers, the lock must be read owned; so we try to wake any read - * locks that were queued ahead of us. */ + /* If there are no active locks, wake the front queued process(es). 
*/ + if (count == RWSEM_WAITING_BIAS) sem = __rwsem_do_wake(sem, RWSEM_WAKE_NO_ACTIVE); raw_spin_unlock_irq(&sem->wait_lock); @@ -220,15 +211,6 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) set_task_state(tsk, TASK_UNINTERRUPTIBLE); if (!waiter.task) break; - - raw_spin_lock_irq(&sem->wait_lock); - /* Try to get the writer sem, may steal from the head writer: */ - if (type == RWSEM_WAITING_FOR_WRITE) - if (try_get_writer_sem(sem, &waiter)) { - raw_spin_unlock_irq(&sem->wait_lock); - return sem; - } - raw_spin_unlock_irq(&sem->wait_lock); schedule(); } -- cgit v1.2.3 From 023fe4f712028d25b42d31984abae1f3d3f0e3e2 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Tue, 7 May 2013 06:45:53 -0700 Subject: rwsem: simplify rwsem_down_write_failed When waking writers, we never grant them the lock - instead, they have to acquire it themselves when they run, and remove themselves from the wait_list when they succeed. As a result, we can do a few simplifications in rwsem_down_write_failed(): - We don't need to check for !waiter.task since __rwsem_do_wake() doesn't remove writers from the wait_list - There is no point releasing the wait_lock before entering the wait loop, as we will need to reacquire it immediately. We can change the loop so that the lock is always held at the start of each loop iteration. - We don't need to get a reference on the task structure, since the task is responsible for removing itself from the wait_list. There is no risk, like in the rwsem_down_read_failed() case, that a task would wake up and exit (thus destroying its task structure) while __rwsem_do_wake() is still running - wait_lock protects against that.
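A hedged userspace sketch of the resulting loop shape, with a pthread mutex standing in for wait_lock (try_acquire() and wait_for_wakeup() are made-up stand-ins, not the rwsem primitives): the lock is held at the top of every iteration, dropped only around the blocking call, and the waiter dequeues itself while still holding it, so no other thread ever needs a reference to it.

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t wait_lock = PTHREAD_MUTEX_INITIALIZER;

static void writer_slow_path(bool (*try_acquire)(void),
                             void (*wait_for_wakeup)(void))
{
        pthread_mutex_lock(&wait_lock);
        while (!try_acquire()) {
                pthread_mutex_unlock(&wait_lock);
                wait_for_wakeup();      /* stands in for schedule() */
                pthread_mutex_lock(&wait_lock);
        }
        /* still under wait_lock: remove our stack-allocated waiter here */
        pthread_mutex_unlock(&wait_lock);
}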
Signed-off-by: Michel Lespinasse Reviewed-by: Rik van Riel Reviewed-by: Peter Hurley Acked-by: Davidlohr Bueso Signed-off-by: Linus Torvalds --- lib/rwsem.c | 33 +++++++++------------------------ 1 file changed, 9 insertions(+), 24 deletions(-) (limited to 'lib') diff --git a/lib/rwsem.c b/lib/rwsem.c index 66f307e90761..c73bd96dc30c 100644 --- a/lib/rwsem.c +++ b/lib/rwsem.c @@ -161,16 +161,8 @@ static int try_get_writer_sem(struct rw_semaphore *sem, try_again_write: oldcount = rwsem_atomic_update(adjustment, sem) - adjustment; - if (!(oldcount & RWSEM_ACTIVE_MASK)) { - /* No active lock: */ - struct task_struct *tsk = waiter->task; - - list_del(&waiter->list); - smp_mb(); - put_task_struct(tsk); - tsk->state = TASK_RUNNING; + if (!(oldcount & RWSEM_ACTIVE_MASK)) return 1; - } /* some one grabbed the sem already */ if (rwsem_atomic_update(-adjustment, sem) & RWSEM_ACTIVE_MASK) return 0; @@ -220,11 +212,10 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) } /* - * wait for the write lock to be granted + * wait until we successfully acquire the write lock */ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) { - enum rwsem_waiter_type type = RWSEM_WAITING_FOR_WRITE; signed long adjustment = -RWSEM_ACTIVE_WRITE_BIAS; struct rwsem_waiter waiter; struct task_struct *tsk = current; @@ -232,8 +223,7 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) /* set up my own style of waitqueue */ waiter.task = tsk; - waiter.type = type; - get_task_struct(tsk); + waiter.type = RWSEM_WAITING_FOR_WRITE; raw_spin_lock_irq(&sem->wait_lock); if (list_empty(&sem->wait_list)) @@ -255,25 +245,20 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) adjustment == -RWSEM_ACTIVE_WRITE_BIAS) sem = __rwsem_do_wake(sem, RWSEM_WAKE_READ_OWNED); - raw_spin_unlock_irq(&sem->wait_lock); - - /* wait to be given the lock */ + /* wait until we successfully acquire the lock */ while (true) { set_task_state(tsk, TASK_UNINTERRUPTIBLE); - if (!waiter.task) + + if (try_get_writer_sem(sem, &waiter)) break; - raw_spin_lock_irq(&sem->wait_lock); - /* Try to get the writer sem, may steal from the head writer: */ - if (type == RWSEM_WAITING_FOR_WRITE) - if (try_get_writer_sem(sem, &waiter)) { - raw_spin_unlock_irq(&sem->wait_lock); - return sem; - } raw_spin_unlock_irq(&sem->wait_lock); schedule(); + raw_spin_lock_irq(&sem->wait_lock); } + list_del(&waiter.list); + raw_spin_unlock_irq(&sem->wait_lock); tsk->state = TASK_RUNNING; return sem; -- cgit v1.2.3 From ed00f64346631dff035adfb9b0240daaa8b46c4e Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Tue, 7 May 2013 06:45:54 -0700 Subject: rwsem: more agressive lock stealing in rwsem_down_write_failed Some small code simplifications can be achieved by doing more agressive lock stealing: - When rwsem_down_write_failed() notices that there are no active locks (and thus no thread to wake us if we decided to sleep), it used to wake the first queued process. However, stealing the lock is also sufficient to deal with this case, so we don't need this check anymore. - In try_get_writer_sem(), we can steal the lock even when the first waiter is a reader. This is correct because the code path that wakes readers is protected by the wait_lock. 
As to the performance effects of this change, they are expected to be minimal: readers are still granted the lock (rather than having to acquire it themselves) when they reach the front of the wait queue, so we have essentially the same behavior as in rwsem-spinlock. Signed-off-by: Michel Lespinasse Reviewed-by: Rik van Riel Reviewed-by: Peter Hurley Acked-by: Davidlohr Bueso Signed-off-by: Linus Torvalds --- lib/rwsem.c | 29 ++++++++--------------------- 1 file changed, 8 insertions(+), 21 deletions(-) (limited to 'lib') diff --git a/lib/rwsem.c b/lib/rwsem.c index c73bd96dc30c..2360bf204098 100644 --- a/lib/rwsem.c +++ b/lib/rwsem.c @@ -143,20 +143,12 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wake_type) } /* Try to get write sem, caller holds sem->wait_lock: */ -static int try_get_writer_sem(struct rw_semaphore *sem, - struct rwsem_waiter *waiter) +static int try_get_writer_sem(struct rw_semaphore *sem) { - struct rwsem_waiter *fwaiter; long oldcount, adjustment; - /* only steal when first waiter is writing */ - fwaiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); - if (fwaiter->type != RWSEM_WAITING_FOR_WRITE) - return 0; - adjustment = RWSEM_ACTIVE_WRITE_BIAS; - /* Only one waiter in the queue: */ - if (fwaiter == waiter && waiter->list.next == &sem->wait_list) + if (list_is_singular(&sem->wait_list)) adjustment -= RWSEM_WAITING_BIAS; try_again_write: @@ -233,23 +225,18 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) /* we're now waiting on the lock, but no longer actively locking */ count = rwsem_atomic_update(adjustment, sem); - /* If there are no active locks, wake the front queued process(es) up. - * - * Alternatively, if we're called from a failed down_write(), there - * were already threads queued before us and there are no active - * writers, the lock must be read owned; so we try to wake any read - * locks that were queued ahead of us. */ - if (count == RWSEM_WAITING_BIAS) - sem = __rwsem_do_wake(sem, RWSEM_WAKE_NO_ACTIVE); - else if (count > RWSEM_WAITING_BIAS && - adjustment == -RWSEM_ACTIVE_WRITE_BIAS) + /* If there were already threads queued before us and there are no + * active writers, the lock must be read owned; so we try to wake + * any read locks that were queued ahead of us. */ + if (count > RWSEM_WAITING_BIAS && + adjustment == -RWSEM_ACTIVE_WRITE_BIAS) sem = __rwsem_do_wake(sem, RWSEM_WAKE_READ_OWNED); /* wait until we successfully acquire the lock */ while (true) { set_task_state(tsk, TASK_UNINTERRUPTIBLE); - if (try_get_writer_sem(sem, &waiter)) + if (try_get_writer_sem(sem)) break; raw_spin_unlock_irq(&sem->wait_lock); -- cgit v1.2.3 From 5ede972df1cd9294c82e9515949fd2103be81d7b Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Tue, 7 May 2013 06:45:55 -0700 Subject: rwsem: use cmpxchg for trying to steal write lock Using rwsem_atomic_update to try stealing the write lock forced us to undo the adjustment in the failure path. We can have simpler and faster code by using cmpxchg instead. 
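The difference between the two approaches, sketched with C11 atomics in userspace (a single counter where 0 means "free" is a simplified stand-in for the real rwsem count encoding, and the real code retries rather than returning false):

#include <stdatomic.h>
#include <stdbool.h>

static _Atomic long count;                      /* 0 means "free" here */

/* Old approach: speculatively add the writer bias, then undo the
 * adjustment if the lock turned out to be held; the failure path costs
 * two atomic read-modify-write operations. */
static bool steal_with_add(long write_bias)
{
        if (atomic_fetch_add(&count, write_bias) == 0)
                return true;
        atomic_fetch_sub(&count, write_bias);   /* roll the bias back */
        return false;
}

/* New approach: one compare-and-swap that only succeeds while the count
 * still reads "free"; nothing has to be undone on failure. */
static bool steal_with_cmpxchg(long write_bias)
{
        long expected = 0;

        return atomic_compare_exchange_strong(&count, &expected, write_bias);
}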
Signed-off-by: Michel Lespinasse Reviewed-by: Peter Hurley Acked-by: Davidlohr Bueso Acked-by: Rik van Riel Signed-off-by: Linus Torvalds --- lib/rwsem.c | 26 ++++++-------------------- 1 file changed, 6 insertions(+), 20 deletions(-) (limited to 'lib') diff --git a/lib/rwsem.c b/lib/rwsem.c index 2360bf204098..64c2dc007be2 100644 --- a/lib/rwsem.c +++ b/lib/rwsem.c @@ -142,25 +142,6 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wake_type) return sem; } -/* Try to get write sem, caller holds sem->wait_lock: */ -static int try_get_writer_sem(struct rw_semaphore *sem) -{ - long oldcount, adjustment; - - adjustment = RWSEM_ACTIVE_WRITE_BIAS; - if (list_is_singular(&sem->wait_list)) - adjustment -= RWSEM_WAITING_BIAS; - -try_again_write: - oldcount = rwsem_atomic_update(adjustment, sem) - adjustment; - if (!(oldcount & RWSEM_ACTIVE_MASK)) - return 1; - /* some one grabbed the sem already */ - if (rwsem_atomic_update(-adjustment, sem) & RWSEM_ACTIVE_MASK) - return 0; - goto try_again_write; -} - /* * wait for the read lock to be granted */ @@ -236,7 +217,12 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) while (true) { set_task_state(tsk, TASK_UNINTERRUPTIBLE); - if (try_get_writer_sem(sem)) + /* Try acquiring the write lock. */ + count = RWSEM_ACTIVE_WRITE_BIAS; + if (!list_is_singular(&sem->wait_list)) + count += RWSEM_WAITING_BIAS; + if (cmpxchg(&sem->count, RWSEM_WAITING_BIAS, count) == + RWSEM_WAITING_BIAS) break; raw_spin_unlock_irq(&sem->wait_lock); -- cgit v1.2.3 From a7d2c573ae7fad1b2c877d1a1342fa5bb0d6478c Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Tue, 7 May 2013 06:45:56 -0700 Subject: rwsem: avoid taking wait_lock in rwsem_down_write_failed In rwsem_down_write_failed(), if there are active locks after we wake up (i.e. the lock got stolen from us), skip taking the wait_lock and go back to sleep immediately. Signed-off-by: Michel Lespinasse Reviewed-by: Peter Hurley Acked-by: Davidlohr Bueso Acked-by: Rik van Riel Signed-off-by: Linus Torvalds --- lib/rwsem.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/rwsem.c b/lib/rwsem.c index 64c2dc007be2..edf3d9ca670e 100644 --- a/lib/rwsem.c +++ b/lib/rwsem.c @@ -214,8 +214,8 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) sem = __rwsem_do_wake(sem, RWSEM_WAKE_READ_OWNED); /* wait until we successfully acquire the lock */ + set_task_state(tsk, TASK_UNINTERRUPTIBLE); while (true) { - set_task_state(tsk, TASK_UNINTERRUPTIBLE); /* Try acquiring the write lock. */ count = RWSEM_ACTIVE_WRITE_BIAS; @@ -226,7 +226,13 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) break; raw_spin_unlock_irq(&sem->wait_lock); - schedule(); + + /* Block until there are no active lockers. */ + do { + schedule(); + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + } while (sem->count & RWSEM_ACTIVE_MASK); + raw_spin_lock_irq(&sem->wait_lock); } -- cgit v1.2.3 From 9b0fc9c09f1b262b7fe697eba6b05095d78850e5 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Tue, 7 May 2013 06:45:57 -0700 Subject: rwsem: skip initial trylock in rwsem_down_write_failed We can skip the initial trylock in rwsem_down_write_failed() if there are known active lockers already, thus saving one likely-to-fail cmpxchg. 
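The underlying pattern, in a minimal C11 userspace form (again a toy counter, not the kernel code): only attempt the compare-and-swap when a cheap read says it can possibly succeed, since even a failing cmpxchg has to take the cache line exclusively.

#include <stdatomic.h>
#include <stdbool.h>

static _Atomic long count;                      /* 0 means "free" here */

static bool try_steal(long write_bias)
{
        long expected = 0;

        /* Plain relaxed load as a filter; the cmpxchg that follows
         * provides the actual synchronization when it succeeds. */
        if (atomic_load_explicit(&count, memory_order_relaxed) != 0)
                return false;                   /* known busy: skip it */
        return atomic_compare_exchange_strong(&count, &expected, write_bias);
}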
Signed-off-by: Michel Lespinasse Reviewed-by: Peter Hurley Acked-by: Davidlohr Bueso Acked-by: Rik van Riel Signed-off-by: Linus Torvalds --- lib/rwsem.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'lib') diff --git a/lib/rwsem.c b/lib/rwsem.c index edf3d9ca670e..0d50e46d5b0c 100644 --- a/lib/rwsem.c +++ b/lib/rwsem.c @@ -216,14 +216,15 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) /* wait until we successfully acquire the lock */ set_task_state(tsk, TASK_UNINTERRUPTIBLE); while (true) { - - /* Try acquiring the write lock. */ - count = RWSEM_ACTIVE_WRITE_BIAS; - if (!list_is_singular(&sem->wait_list)) - count += RWSEM_WAITING_BIAS; - if (cmpxchg(&sem->count, RWSEM_WAITING_BIAS, count) == + if (!(count & RWSEM_ACTIVE_MASK)) { + /* Try acquiring the write lock. */ + count = RWSEM_ACTIVE_WRITE_BIAS; + if (!list_is_singular(&sem->wait_list)) + count += RWSEM_WAITING_BIAS; + if (cmpxchg(&sem->count, RWSEM_WAITING_BIAS, count) == RWSEM_WAITING_BIAS) - break; + break; + } raw_spin_unlock_irq(&sem->wait_lock); @@ -231,7 +232,7 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) do { schedule(); set_task_state(tsk, TASK_UNINTERRUPTIBLE); - } while (sem->count & RWSEM_ACTIVE_MASK); + } while ((count = sem->count) & RWSEM_ACTIVE_MASK); raw_spin_lock_irq(&sem->wait_lock); } -- cgit v1.2.3 From 8cf5322ce69afea1fab6a6270db24d057d664798 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Tue, 7 May 2013 06:45:58 -0700 Subject: rwsem: simplify __rwsem_do_wake This is mostly for cleanup value: - We don't need several gotos to handle the case where the first waiter is a writer. Two simple tests will do (and generate very similar code). - In the remainder of the function, we know the first waiter is a reader, so we don't have to double check that. We can use do..while loops to iterate over the readers to wake (generates slightly better code). Signed-off-by: Michel Lespinasse Reviewed-by: Peter Hurley Acked-by: Davidlohr Bueso Signed-off-by: Linus Torvalds --- lib/rwsem-spinlock.c | 23 +++++++---------------- lib/rwsem.c | 26 ++++++++++++-------------- 2 files changed, 19 insertions(+), 30 deletions(-) (limited to 'lib') diff --git a/lib/rwsem-spinlock.c b/lib/rwsem-spinlock.c index 5f117f37ac0a..9be8a9144978 100644 --- a/lib/rwsem-spinlock.c +++ b/lib/rwsem-spinlock.c @@ -70,26 +70,17 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wakewrite) waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); - if (!wakewrite) { - if (waiter->type == RWSEM_WAITING_FOR_WRITE) - goto out; - goto dont_wake_writers; - } - - /* - * as we support write lock stealing, we can't set sem->activity - * to -1 here to indicate we get the lock. Instead, we wake it up - * to let it go get it again. - */ if (waiter->type == RWSEM_WAITING_FOR_WRITE) { - wake_up_process(waiter->task); + if (wakewrite) + /* Wake up a writer. Note that we do not grant it the + * lock - it will have to acquire it when it runs. 
*/ + wake_up_process(waiter->task); goto out; } /* grant an infinite number of read locks to the front of the queue */ - dont_wake_writers: woken = 0; - while (waiter->type == RWSEM_WAITING_FOR_READ) { + do { struct list_head *next = waiter->list.next; list_del(&waiter->list); @@ -99,10 +90,10 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wakewrite) wake_up_process(tsk); put_task_struct(tsk); woken++; - if (list_empty(&sem->wait_list)) + if (next == &sem->wait_list) break; waiter = list_entry(next, struct rwsem_waiter, list); - } + } while (waiter->type != RWSEM_WAITING_FOR_WRITE); sem->activity += woken; diff --git a/lib/rwsem.c b/lib/rwsem.c index 0d50e46d5b0c..9a675fa9d78e 100644 --- a/lib/rwsem.c +++ b/lib/rwsem.c @@ -68,20 +68,17 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wake_type) signed long woken, loop, adjustment; waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); - if (waiter->type != RWSEM_WAITING_FOR_WRITE) - goto readers_only; - - if (wake_type == RWSEM_WAKE_READ_OWNED) - /* Another active reader was observed, so wakeup is not - * likely to succeed. Save the atomic op. - */ + if (waiter->type == RWSEM_WAITING_FOR_WRITE) { + if (wake_type != RWSEM_WAKE_READ_OWNED) + /* Wake writer at the front of the queue, but do not + * grant it the lock yet as we want other writers + * to be able to steal it. Readers, on the other hand, + * will block as they will notice the queued writer. + */ + wake_up_process(waiter->task); goto out; + } - /* Wake up the writing waiter and let the task grab the sem: */ - wake_up_process(waiter->task); - goto out; - - readers_only: /* If we come here from up_xxxx(), another thread might have reached * rwsem_down_failed_common() before we acquired the spinlock and * woken up a waiter, making it now active. We prefer to check for @@ -125,7 +122,8 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wake_type) rwsem_atomic_add(adjustment, sem); next = sem->wait_list.next; - for (loop = woken; loop > 0; loop--) { + loop = woken; + do { waiter = list_entry(next, struct rwsem_waiter, list); next = waiter->list.next; tsk = waiter->task; @@ -133,7 +131,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wake_type) waiter->task = NULL; wake_up_process(tsk); put_task_struct(tsk); - } + } while (--loop); sem->wait_list.next = next; next->prev = &sem->wait_list; -- cgit v1.2.3 From fe6e674c6187d4f452a679ced7e95262bd517936 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Tue, 7 May 2013 06:45:59 -0700 Subject: rwsem: implement support for write lock stealing on the fastpath When we decide to wake up readers, we must first grant them as many read locks as necessary, and then actually wake up all these readers. But in order to know how many read shares to grant, we must first count the readers at the head of the queue. This might take a while if there are many readers, and we want to be protected against a writer stealing the lock while we're counting. To that end, we grant the first reader lock before counting how many more readers are queued. We also require some adjustments to the wake_type semantics. RWSEM_WAKE_NO_ACTIVE used to mean that we had found the count to be RWSEM_WAITING_BIAS, in which case the rwsem was known to be free as nobody could steal it while we hold the wait_lock. This doesn't make sense once we implement fastpath write lock stealing, so we now use RWSEM_WAKE_ANY in that case. 
Similarly, when rwsem_down_write_failed found that a read lock was active, it would use RWSEM_WAKE_READ_OWNED which signalled that new readers could be woken without checking first that the rwsem was available. We can't do that anymore since the existing readers might release their read locks, and a writer could steal the lock before we wake up additional readers. So, we have to use a new RWSEM_WAKE_READERS value to indicate we only want to wake readers, but we don't currently hold any read lock. Signed-off-by: Michel Lespinasse Reviewed-by: Peter Hurley Acked-by: Davidlohr Bueso Signed-off-by: Linus Torvalds --- lib/rwsem.c | 64 ++++++++++++++++++++++++++++++------------------------------- 1 file changed, 32 insertions(+), 32 deletions(-) (limited to 'lib') diff --git a/lib/rwsem.c b/lib/rwsem.c index 9a675fa9d78e..bbe48c04f363 100644 --- a/lib/rwsem.c +++ b/lib/rwsem.c @@ -4,6 +4,7 @@ * Derived from arch/i386/kernel/semaphore.c * * Writer lock-stealing by Alex Shi + * and Michel Lespinasse */ #include #include @@ -41,13 +42,11 @@ struct rwsem_waiter { enum rwsem_waiter_type type; }; -/* Wake types for __rwsem_do_wake(). Note that RWSEM_WAKE_NO_ACTIVE and - * RWSEM_WAKE_READ_OWNED imply that the spinlock must have been kept held - * since the rwsem value was observed. - */ -#define RWSEM_WAKE_ANY 0 /* Wake whatever's at head of wait list */ -#define RWSEM_WAKE_NO_ACTIVE 1 /* rwsem was observed with no active thread */ -#define RWSEM_WAKE_READ_OWNED 2 /* rwsem was observed to be read owned */ +enum rwsem_wake_type { + RWSEM_WAKE_ANY, /* Wake whatever's at head of wait list */ + RWSEM_WAKE_READERS, /* Wake readers only */ + RWSEM_WAKE_READ_OWNED /* Waker thread holds the read lock */ +}; /* * handle the lock release when processes blocked on it that can now run @@ -60,16 +59,16 @@ struct rwsem_waiter { * - writers are only woken if downgrading is false */ static struct rw_semaphore * -__rwsem_do_wake(struct rw_semaphore *sem, int wake_type) +__rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) { struct rwsem_waiter *waiter; struct task_struct *tsk; struct list_head *next; - signed long woken, loop, adjustment; + signed long oldcount, woken, loop, adjustment; waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); if (waiter->type == RWSEM_WAITING_FOR_WRITE) { - if (wake_type != RWSEM_WAKE_READ_OWNED) + if (wake_type == RWSEM_WAKE_ANY) /* Wake writer at the front of the queue, but do not * grant it the lock yet as we want other writers * to be able to steal it. Readers, on the other hand, @@ -79,24 +78,24 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wake_type) goto out; } - /* If we come here from up_xxxx(), another thread might have reached - * rwsem_down_failed_common() before we acquired the spinlock and - * woken up a waiter, making it now active. We prefer to check for - * this first in order to not spend too much time with the spinlock - * held if we're not going to be able to wake up readers in the end. - * - * Note that we do not need to update the rwsem count: any writer - * trying to acquire rwsem will run rwsem_down_write_failed() due - * to the waiting threads and block trying to acquire the spinlock. - * - * We use a dummy atomic update in order to acquire the cache line - * exclusively since we expect to succeed and run the final rwsem - * count adjustment pretty soon. + /* Writers might steal the lock before we grant it to the next reader. 
+ * We prefer to do the first reader grant before counting readers + * so we can bail out early if a writer stole the lock. */ - if (wake_type == RWSEM_WAKE_ANY && - rwsem_atomic_update(0, sem) < RWSEM_WAITING_BIAS) - /* Someone grabbed the sem for write already */ - goto out; + adjustment = 0; + if (wake_type != RWSEM_WAKE_READ_OWNED) { + adjustment = RWSEM_ACTIVE_READ_BIAS; + try_reader_grant: + oldcount = rwsem_atomic_update(adjustment, sem) - adjustment; + if (unlikely(oldcount < RWSEM_WAITING_BIAS)) { + /* A writer stole the lock. Undo our reader grant. */ + if (rwsem_atomic_update(-adjustment, sem) & + RWSEM_ACTIVE_MASK) + goto out; + /* Last active locker left. Retry waking readers. */ + goto try_reader_grant; + } + } /* Grant an infinite number of read locks to the readers at the front * of the queue. Note we increment the 'active part' of the count by @@ -114,12 +113,13 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wake_type) } while (waiter->type != RWSEM_WAITING_FOR_WRITE); - adjustment = woken * RWSEM_ACTIVE_READ_BIAS; + adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment; if (waiter->type != RWSEM_WAITING_FOR_WRITE) /* hit end of list above */ adjustment -= RWSEM_WAITING_BIAS; - rwsem_atomic_add(adjustment, sem); + if (adjustment) + rwsem_atomic_add(adjustment, sem); next = sem->wait_list.next; loop = woken; @@ -164,8 +164,8 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) count = rwsem_atomic_update(adjustment, sem); /* If there are no active locks, wake the front queued process(es). */ - if (count == RWSEM_WAITING_BIAS) - sem = __rwsem_do_wake(sem, RWSEM_WAKE_NO_ACTIVE); + if (!(count & RWSEM_ACTIVE_MASK)) + sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY); raw_spin_unlock_irq(&sem->wait_lock); @@ -209,7 +209,7 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) * any read locks that were queued ahead of us. */ if (count > RWSEM_WAITING_BIAS && adjustment == -RWSEM_ACTIVE_WRITE_BIAS) - sem = __rwsem_do_wake(sem, RWSEM_WAKE_READ_OWNED); + sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS); /* wait until we successfully acquire the lock */ set_task_state(tsk, TASK_UNINTERRUPTIBLE); -- cgit v1.2.3 From 25c39325968bbcebe6cd2a1991228c9dfb48d655 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Tue, 7 May 2013 06:46:00 -0700 Subject: rwsem: do not block readers at head of queue if other readers are active This change fixes a race condition where a reader might determine it needs to block, but by the time it acquires the wait_lock the rwsem has active readers and no queued waiters. In this situation the reader can run in parallel with the existing active readers; it does not need to block until the active readers complete. Thanks to Peter Hurley for noticing this possible race. Signed-off-by: Michel Lespinasse Reviewed-by: Peter Hurley Acked-by: Davidlohr Bueso Signed-off-by: Linus Torvalds --- lib/rwsem.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/rwsem.c b/lib/rwsem.c index bbe48c04f363..61f91ca75e40 100644 --- a/lib/rwsem.c +++ b/lib/rwsem.c @@ -163,8 +163,14 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) /* we're now waiting on the lock, but no longer actively locking */ count = rwsem_atomic_update(adjustment, sem); - /* If there are no active locks, wake the front queued process(es). */ - if (!(count & RWSEM_ACTIVE_MASK)) + /* If there are no active locks, wake the front queued process(es). 
+ * + * If there are no writers and we are first in the queue, + * wake our own waiter to join the existing active readers ! + */ + if (count == RWSEM_WAITING_BIAS || + (count > RWSEM_WAITING_BIAS && + adjustment != -RWSEM_ACTIVE_READ_BIAS)) sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY); raw_spin_unlock_irq(&sem->wait_lock); -- cgit v1.2.3 From b5f541810ea9fb98d93c0ee0e00e07a22874856f Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 7 May 2013 06:46:02 -0700 Subject: rwsem: no need for explicit signed longs Change explicit "signed long" declarations into plain "long" as suggested by Peter Hurley. Signed-off-by: Davidlohr Bueso Reviewed-by: Michel Lespinasse Signed-off-by: Michel Lespinasse Signed-off-by: Linus Torvalds --- lib/rwsem.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'lib') diff --git a/lib/rwsem.c b/lib/rwsem.c index 61f91ca75e40..cf0ad2ad19f5 100644 --- a/lib/rwsem.c +++ b/lib/rwsem.c @@ -64,7 +64,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) struct rwsem_waiter *waiter; struct task_struct *tsk; struct list_head *next; - signed long oldcount, woken, loop, adjustment; + long oldcount, woken, loop, adjustment; waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); if (waiter->type == RWSEM_WAITING_FOR_WRITE) { @@ -145,10 +145,9 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) */ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) { - signed long adjustment = -RWSEM_ACTIVE_READ_BIAS; + long count, adjustment = -RWSEM_ACTIVE_READ_BIAS; struct rwsem_waiter waiter; struct task_struct *tsk = current; - signed long count; /* set up my own style of waitqueue */ waiter.task = tsk; @@ -193,10 +192,9 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) */ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) { - signed long adjustment = -RWSEM_ACTIVE_WRITE_BIAS; + long count, adjustment = -RWSEM_ACTIVE_WRITE_BIAS; struct rwsem_waiter waiter; struct task_struct *tsk = current; - signed long count; /* set up my own style of waitqueue */ waiter.task = tsk; -- cgit v1.2.3 From 2d864e41710f1d2ba406fb62018ab0487152e6f2 Mon Sep 17 00:00:00 2001 From: Anatol Pomozov Date: Tue, 7 May 2013 15:37:48 -0700 Subject: kref: minor cleanup - make warning smp-safe - result of atomic _unless_zero functions should be checked by caller to avoid use-after-free error - trivial whitespace fix. 
Link: https://lkml.org/lkml/2013/4/12/391 Tested: compile x86, boot machine and run xfstests Signed-off-by: Anatol Pomozov [ Removed line-break, changed to use WARN_ON_ONCE() - Linus ] Signed-off-by: Linus Torvalds --- lib/kobject.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/kobject.c b/lib/kobject.c index a65486613d79..b7e29a6056d3 100644 --- a/lib/kobject.c +++ b/lib/kobject.c @@ -529,7 +529,7 @@ struct kobject *kobject_get(struct kobject *kobj) return kobj; } -static struct kobject *kobject_get_unless_zero(struct kobject *kobj) +static struct kobject * __must_check kobject_get_unless_zero(struct kobject *kobj) { if (!kref_get_unless_zero(&kobj->kref)) kobj = NULL; -- cgit v1.2.3 From 9607a85b67a9714290a78c1a56630ab1c9fa2c23 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 7 May 2013 15:39:03 -0700 Subject: rwsem: check counter to avoid cmpxchg calls This patch tries to reduce the amount of cmpxchg calls in the writer failed path by checking the counter value first before issuing the instruction. If ->count is not set to RWSEM_WAITING_BIAS then there is no point wasting a cmpxchg call. Furthermore, Michel states "I suppose it helps due to the case where someone else steals the lock while we're trying to acquire sem->wait_lock." Two very different workloads and machines were used to see how this patch improves throughput: pgbench on a quad-core laptop and aim7 on a large 8 socket box with 80 cores. Some results comparing Michel's fast-path write lock stealing (tps-rwsem) on a quad-core laptop running pgbench: | db_size | clients | tps-rwsem | tps-patch | +---------+----------+----------------+--------------+ | 160 MB | 1 | 6906 | 9153 | + 32.5 | 160 MB | 2 | 15931 | 22487 | + 41.1% | 160 MB | 4 | 33021 | 32503 | | 160 MB | 8 | 34626 | 34695 | | 160 MB | 16 | 33098 | 34003 | | 160 MB | 20 | 31343 | 31440 | | 160 MB | 30 | 28961 | 28987 | | 160 MB | 40 | 26902 | 26970 | | 160 MB | 50 | 25760 | 25810 | ------------------------------------------------------ | 1.6 GB | 1 | 7729 | 7537 | | 1.6 GB | 2 | 19009 | 23508 | + 23.7% | 1.6 GB | 4 | 33185 | 32666 | | 1.6 GB | 8 | 34550 | 34318 | | 1.6 GB | 16 | 33079 | 32689 | | 1.6 GB | 20 | 31494 | 31702 | | 1.6 GB | 30 | 28535 | 28755 | | 1.6 GB | 40 | 27054 | 27017 | | 1.6 GB | 50 | 25591 | 25560 | ------------------------------------------------------ | 7.6 GB | 1 | 6224 | 7469 | + 20.0% | 7.6 GB | 2 | 13611 | 12778 | | 7.6 GB | 4 | 33108 | 32927 | | 7.6 GB | 8 | 34712 | 34878 | | 7.6 GB | 16 | 32895 | 33003 | | 7.6 GB | 20 | 31689 | 31974 | | 7.6 GB | 30 | 29003 | 28806 | | 7.6 GB | 40 | 26683 | 26976 | | 7.6 GB | 50 | 25925 | 25652 | ------------------------------------------------------ For the aim7 worloads, they overall improved on top of Michel's patchset. 
For full graphs on how the rwsem series plus this patch behaves on a large 8 socket machine against a vanilla kernel: http://stgolabs.net/rwsem-aim7-results.tar.gz Signed-off-by: Davidlohr Bueso Signed-off-by: Linus Torvalds --- lib/rwsem.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/rwsem.c b/lib/rwsem.c index cf0ad2ad19f5..19c5fa95e0b4 100644 --- a/lib/rwsem.c +++ b/lib/rwsem.c @@ -223,7 +223,9 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) count = RWSEM_ACTIVE_WRITE_BIAS; if (!list_is_singular(&sem->wait_list)) count += RWSEM_WAITING_BIAS; - if (cmpxchg(&sem->count, RWSEM_WAITING_BIAS, count) == + + if (sem->count == RWSEM_WAITING_BIAS && + cmpxchg(&sem->count, RWSEM_WAITING_BIAS, count) == RWSEM_WAITING_BIAS) break; } -- cgit v1.2.3