From f792685006274a850e6cc0ea9ade275ccdfc90bc Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 5 Mar 2013 18:05:46 +0100 Subject: math64: New div64_u64_rem helper Provide an extended version of div64_u64() that also returns the remainder of the division. We are going to need this to refine the cputime scaling code. Signed-off-by: Frederic Weisbecker Cc: Stanislaw Gruszka Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Andrew Morton --- lib/div64.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) (limited to 'lib') diff --git a/lib/div64.c b/lib/div64.c index a163b6caef73..3af5728d95fd 100644 --- a/lib/div64.c +++ b/lib/div64.c @@ -79,9 +79,10 @@ EXPORT_SYMBOL(div_s64_rem); #endif /** - * div64_u64 - unsigned 64bit divide with 64bit divisor + * div64_u64_rem - unsigned 64bit divide with 64bit divisor and 64bit remainder * @dividend: 64bit dividend * @divisor: 64bit divisor + * @remainder: 64bit remainder * * This implementation is a modified version of the algorithm proposed * by the book 'Hacker's Delight'. The original source and full proof @@ -89,27 +90,33 @@ EXPORT_SYMBOL(div_s64_rem); * * 'http://www.hackersdelight.org/HDcode/newCode/divDouble.c.txt' */ -#ifndef div64_u64 -u64 div64_u64(u64 dividend, u64 divisor) +#ifndef div64_u64_rem +u64 div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder) { u32 high = divisor >> 32; u64 quot; if (high == 0) { - quot = div_u64(dividend, divisor); + u32 rem32; + quot = div_u64_rem(dividend, divisor, &rem32); + *remainder = rem32; } else { int n = 1 + fls(high); quot = div_u64(dividend >> n, divisor >> n); if (quot != 0) quot--; - if ((dividend - quot * divisor) >= divisor) + + *remainder = dividend - quot * divisor; + if (*remainder >= divisor) { quot++; + *remainder -= divisor; + } } return quot; } -EXPORT_SYMBOL(div64_u64); +EXPORT_SYMBOL(div64_u64_rem); #endif /** -- cgit v1.2.3 From cbe5e6109538ddab57764a88d9f0c2accd0c7d48 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Fri, 22 Mar 2013 22:17:36 -0600 Subject: lru_cache: introduce lc_get_cumulative() New helper to be able to consolidate more updates into a single transaction. Without this, we can only grab a single refcount on an updated element while preparing a transaction. lc_get_cumulative - like lc_get; also finds to-be-changed elements @lc: the lru cache to operate on @enr: the label to look up Unlike lc_get this also returns the element for @enr, if it is belonging to a pending transaction, so the return values are like for lc_get(), plus: pointer to an element already on the "to_be_changed" list. In this case, the cache was already marked %LC_DIRTY. Caller needs to make sure that the pending transaction is completed, before proceeding to actually use this element. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Fixed up by Jens to export lc_get_cumulative(). 
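For illustration only (not part of the patch; the helper name and the label range are made up), a caller preparing one cumulative transaction could collect references on every label it intends to update before writing the transaction out:

#include <linux/lru_cache.h>

/* hypothetical helper: grab references for labels [first, last] in one go */
static void collect_refs(struct lru_cache *lc, unsigned int first, unsigned int last)
{
	unsigned int enr;

	for (enr = first; enr <= last; enr++) {
		struct lc_element *e = lc_get_cumulative(lc, enr);

		if (!e)
			continue;	/* no free slot yet; retry after the pending transaction */
		/*
		 * e may still sit on the "to_be_changed" list; the cache is
		 * then marked LC_DIRTY and the pending transaction has to be
		 * completed before e is actually used.
		 */
	}
}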
Signed-off-by: Jens Axboe --- lib/lru_cache.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 46 insertions(+), 10 deletions(-) (limited to 'lib') diff --git a/lib/lru_cache.c b/lib/lru_cache.c index 8335d39d2ccd..4a83ecd03650 100644 --- a/lib/lru_cache.c +++ b/lib/lru_cache.c @@ -365,7 +365,13 @@ static int lc_unused_element_available(struct lru_cache *lc) return 0; } -static struct lc_element *__lc_get(struct lru_cache *lc, unsigned int enr, bool may_change) +/* used as internal flags to __lc_get */ +enum { + LC_GET_MAY_CHANGE = 1, + LC_GET_MAY_USE_UNCOMMITTED = 2, +}; + +static struct lc_element *__lc_get(struct lru_cache *lc, unsigned int enr, unsigned int flags) { struct lc_element *e; @@ -380,22 +386,31 @@ static struct lc_element *__lc_get(struct lru_cache *lc, unsigned int enr, bool * this enr is currently being pulled in already, * and will be available once the pending transaction * has been committed. */ - if (e && e->lc_new_number == e->lc_number) { + if (e) { + if (e->lc_new_number != e->lc_number) { + /* It has been found above, but on the "to_be_changed" + * list, not yet committed. Don't pull it in twice, + * wait for the transaction, then try again... + */ + if (!(flags & LC_GET_MAY_USE_UNCOMMITTED)) + RETURN(NULL); + /* ... unless the caller is aware of the implications, + * probably preparing a cumulative transaction. */ + ++e->refcnt; + ++lc->hits; + RETURN(e); + } + /* else: lc_new_number == lc_number; a real hit. */ ++lc->hits; if (e->refcnt++ == 0) lc->used++; list_move(&e->list, &lc->in_use); /* Not evictable... */ RETURN(e); } + /* e == NULL */ ++lc->misses; - if (!may_change) - RETURN(NULL); - - /* It has been found above, but on the "to_be_changed" list, not yet - * committed. Don't pull it in twice, wait for the transaction, then - * try again */ - if (e) + if (!(flags & LC_GET_MAY_CHANGE)) RETURN(NULL); /* To avoid races with lc_try_lock(), first, mark us dirty @@ -477,7 +492,27 @@ static struct lc_element *__lc_get(struct lru_cache *lc, unsigned int enr, bool */ struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr) { - return __lc_get(lc, enr, 1); + return __lc_get(lc, enr, LC_GET_MAY_CHANGE); +} + +/** + * lc_get_cumulative - like lc_get; also finds to-be-changed elements + * @lc: the lru cache to operate on + * @enr: the label to look up + * + * Unlike lc_get this also returns the element for @enr, if it is belonging to + * a pending transaction, so the return values are like for lc_get(), + * plus: + * + * pointer to an element already on the "to_be_changed" list. + * In this case, the cache was already marked %LC_DIRTY. + * + * Caller needs to make sure that the pending transaction is completed, + * before proceeding to actually use this element. + */ +struct lc_element *lc_get_cumulative(struct lru_cache *lc, unsigned int enr) +{ + return __lc_get(lc, enr, LC_GET_MAY_CHANGE|LC_GET_MAY_USE_UNCOMMITTED); } /** @@ -648,3 +683,4 @@ EXPORT_SYMBOL(lc_seq_printf_stats); EXPORT_SYMBOL(lc_seq_dump_details); EXPORT_SYMBOL(lc_try_lock); EXPORT_SYMBOL(lc_is_used); +EXPORT_SYMBOL(lc_get_cumulative); -- cgit v1.2.3 From 2db76d7c3c6db93058f983c8240f7c7c25e87ee6 Mon Sep 17 00:00:00 2001 From: Imre Deak Date: Tue, 26 Mar 2013 15:14:18 +0200 Subject: lib/scatterlist: sg_page_iter: support sg lists w/o backing pages The i915 driver uses sg lists for memory without backing 'struct page' pages, similarly to other IO memory regions, setting only the DMA address for these. 
It does this, so that it can program the HW MMU tables in a uniform way both for sg lists with and without backing pages. Without a valid page pointer we can't call nth_page to get the current page in __sg_page_iter_next, so add a helper that relevant users can call separately. Also add a helper to get the DMA address of the current page (idea from Daniel). Convert all places in i915 to use the new API. Signed-off-by: Imre Deak Reviewed-by: Damien Lespiau Signed-off-by: Daniel Vetter --- lib/scatterlist.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'lib') diff --git a/lib/scatterlist.c b/lib/scatterlist.c index b83c144d731f..a1cf8cae60e7 100644 --- a/lib/scatterlist.c +++ b/lib/scatterlist.c @@ -401,7 +401,6 @@ void __sg_page_iter_start(struct sg_page_iter *piter, piter->__pg_advance = 0; piter->__nents = nents; - piter->page = NULL; piter->sg = sglist; piter->sg_pgoffset = pgoffset; } @@ -426,7 +425,6 @@ bool __sg_page_iter_next(struct sg_page_iter *piter) if (!--piter->__nents || !piter->sg) return false; } - piter->page = nth_page(sg_page(piter->sg), piter->sg_pgoffset); return true; } @@ -496,7 +494,7 @@ bool sg_miter_next(struct sg_mapping_iter *miter) miter->__remaining = min_t(unsigned long, miter->__remaining, PAGE_SIZE - miter->__offset); } - miter->page = miter->piter.page; + miter->page = sg_page_iter_page(&miter->piter); miter->consumed = miter->length = miter->__remaining; if (miter->__flags & SG_MITER_ATOMIC) -- cgit v1.2.3 From 0ecc833bac594099505a090cbca6ccd5b83d5975 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 29 Mar 2013 12:23:28 -0400 Subject: mode_t, whack-a-mole at 11... Signed-off-by: Al Viro --- lib/notifier-error-inject.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/notifier-error-inject.c b/lib/notifier-error-inject.c index 44b92cb6224f..eb4a04afea80 100644 --- a/lib/notifier-error-inject.c +++ b/lib/notifier-error-inject.c @@ -17,7 +17,7 @@ static int debugfs_errno_get(void *data, u64 *val) DEFINE_SIMPLE_ATTRIBUTE(fops_errno, debugfs_errno_get, debugfs_errno_set, "%lld\n"); -static struct dentry *debugfs_create_errno(const char *name, mode_t mode, +static struct dentry *debugfs_create_errno(const char *name, umode_t mode, struct dentry *parent, int *value) { return debugfs_create_file(name, mode, parent, value, &fops_errno); @@ -50,7 +50,7 @@ struct dentry *notifier_err_inject_init(const char *name, struct dentry *parent, struct notifier_err_inject *err_inject, int priority) { struct notifier_err_inject_action *action; - mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; + umode_t mode = S_IFREG | S_IRUSR | S_IWUSR; struct dentry *dir; struct dentry *actions_dir; -- cgit v1.2.3 From a49b7e82cab0f9b41f483359be83f44fbb6b4979 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 13 Apr 2013 15:15:30 -0700 Subject: kobject: fix kset_find_obj() race with concurrent last kobject_put() Anatol Pomozov identified a race condition that hits module unloading and re-loading. To quote Anatol: "This is a race condition that exists between kset_find_obj() and kobject_put(). kset_find_obj() might return a kobject that has refcount equal to 0 if this kobject is being freed by kobject_put() in another thread. Here is the timeline for the crash in the case where kset_find_obj() searches for an object that nobody holds and another thread is doing kobject_put() on the same kobject: THREAD A (calls kset_find_obj()) THREAD B (calls kobject_put()) spin_lock() atomic_dec_return(kobj->kref), counter gets zero here ...
starts kobject cleanup .... spin_lock() // WAIT thread A in kobj_kset_leave() iterate over kset->list atomic_inc(kobj->kref) (counter becomes 1) spin_unlock() spin_lock() // taken // it does not know that thread A increased counter so it removes obj from the list spin_unlock() vfree(module) // frees module object with containing kobj // kobj points to freed memory area!! kobject_put(kobj) // OOPS!!!! The race above happens because module.c tries to use kset_find_obj() when somebody unloads a module. The module.c code was introduced in commit 6494a93d55fa" Anatol supplied a patch specific to module.c that worked around the problem by simply not using kset_find_obj() at all, but rather than make a local band-aid, this just fixes kset_find_obj() to be thread-safe using the proper model of refusing to get a new reference if the refcount has already dropped to zero. See examples of this proper refcount handling not only in the kref documentation, but in various other equivalent uses of this pattern by grepping for atomic_inc_not_zero(). [ Side note: the module race does indicate that module loading and unloading is not properly serialized wrt sysfs information using the module mutex. That may require further thought, but this is the correct fix at the kobject layer regardless. ] Reported-analyzed-and-tested-by: Anatol Pomozov Cc: Greg Kroah-Hartman Cc: Al Viro Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- lib/kobject.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/kobject.c b/lib/kobject.c index e07ee1fcd6f1..a65486613d79 100644 --- a/lib/kobject.c +++ b/lib/kobject.c @@ -529,6 +529,13 @@ struct kobject *kobject_get(struct kobject *kobj) return kobj; } +static struct kobject *kobject_get_unless_zero(struct kobject *kobj) +{ + if (!kref_get_unless_zero(&kobj->kref)) + kobj = NULL; + return kobj; +} + /* * kobject_cleanup - free kobject resources. * @kobj: object to cleanup @@ -751,7 +758,7 @@ struct kobject *kset_find_obj(struct kset *kset, const char *name) list_for_each_entry(k, &kset->list, entry) { if (kobject_name(k) && !strcmp(kobject_name(k), name)) { - ret = kobject_get(k); + ret = kobject_get_unless_zero(k); break; } } -- cgit v1.2.3 From 0635eb8a54cf0fea64b174bb68bc36b9c3d622db Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Mon, 15 Apr 2013 13:09:45 -0700 Subject: Move utf16 functions to kernel core and rename We want to be able to use the utf16 functions that are currently present in the EFI variables code in platform-specific code as well. Move them to the kernel core, and in the process rename them to accurately describe what they do - they don't handle UTF16, only UCS2. Signed-off-by: Matthew Garrett Signed-off-by: Matt Fleming --- lib/Kconfig | 3 +++ lib/Makefile | 2 ++ lib/ucs2_string.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 56 insertions(+) create mode 100644 lib/ucs2_string.c (limited to 'lib') diff --git a/lib/Kconfig b/lib/Kconfig index 3958dc4389f9..fe01d418b09a 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -404,4 +404,7 @@ config OID_REGISTRY help Enable fast lookup object identifier registry.
+config UCS2_STRING + tristate + endmenu diff --git a/lib/Makefile b/lib/Makefile index d7946ff75b2e..6e2cc561f761 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -174,3 +174,5 @@ quiet_cmd_build_OID_registry = GEN $@ cmd_build_OID_registry = perl $(srctree)/$(src)/build_OID_registry $< $@ clean-files += oid_registry_data.c + +obj-$(CONFIG_UCS2_STRING) += ucs2_string.o diff --git a/lib/ucs2_string.c b/lib/ucs2_string.c new file mode 100644 index 000000000000..6f500ef2301d --- /dev/null +++ b/lib/ucs2_string.c @@ -0,0 +1,51 @@ +#include +#include + +/* Return the number of unicode characters in data */ +unsigned long +ucs2_strnlen(const ucs2_char_t *s, size_t maxlength) +{ + unsigned long length = 0; + + while (*s++ != 0 && length < maxlength) + length++; + return length; +} +EXPORT_SYMBOL(ucs2_strnlen); + +unsigned long +ucs2_strlen(const ucs2_char_t *s) +{ + return ucs2_strnlen(s, ~0UL); +} +EXPORT_SYMBOL(ucs2_strlen); + +/* + * Return the number of bytes is the length of this string + * Note: this is NOT the same as the number of unicode characters + */ +unsigned long +ucs2_strsize(const ucs2_char_t *data, unsigned long maxlength) +{ + return ucs2_strnlen(data, maxlength/sizeof(ucs2_char_t)) * sizeof(ucs2_char_t); +} +EXPORT_SYMBOL(ucs2_strsize); + +int +ucs2_strncmp(const ucs2_char_t *a, const ucs2_char_t *b, size_t len) +{ + while (1) { + if (len == 0) + return 0; + if (*a < *b) + return -1; + if (*a > *b) + return 1; + if (*a == 0) /* implies *b == 0 */ + return 0; + a++; + b++; + len--; + } +} +EXPORT_SYMBOL(ucs2_strncmp); -- cgit v1.2.3 From c729de8fcea37a1c444e81857eace12494c804a9 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 15 Apr 2013 22:23:45 -0700 Subject: x86, kdump: Set crashkernel_low automatically Chao said that kdump does not work well on his system on 3.8 without an extra parameter: even though iommu does not work with kdump, he now has to append crashkernel_low=Y in the first kernel to make kdump work. We have now modified crashkernel=X to allocate memory beyond 4G (if available) and do not allocate the low range for the crash kernel if the user does not specify that with crashkernel_low=Y. This causes a regression if the iommu is not enabled. Without an iommu, swiotlb needs to be set up in the first 4G and there is no low memory available to the second kernel. Set crashkernel_low automatically if the user does not specify it. For systems that do support IOMMU with kdump properly, the user can specify crashkernel_low=0 to save that 72M of low RAM. -v3: add swiotlb_size() according to Konrad. -v4: add comments what 8M is for according to hpa. also update more crashkernel_low= in kernel-parameters.txt -v5: update changelog according to Vivek. -v6: Change description about swiotlb referring according to HATAYAMA. Reported-by: WANG Chao Tested-by: WANG Chao Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1366089828-19692-2-git-send-email-yinghai@kernel.org Acked-by: Vivek Goyal Signed-off-by: H. Peter Anvin --- lib/swiotlb.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'lib') diff --git a/lib/swiotlb.c b/lib/swiotlb.c index bfe02b8fc55b..d23762e6652c 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -105,9 +105,9 @@ setup_io_tlb_npages(char *str) if (!strcmp(str, "force")) swiotlb_force = 1; - return 1; + return 0; } -__setup("swiotlb=", setup_io_tlb_npages); +early_param("swiotlb", setup_io_tlb_npages); /* make io_tlb_overflow tunable too?
*/ unsigned long swiotlb_nr_tbl(void) @@ -115,6 +115,18 @@ unsigned long swiotlb_nr_tbl(void) return io_tlb_nslabs; } EXPORT_SYMBOL_GPL(swiotlb_nr_tbl); + +/* default to 64MB */ +#define IO_TLB_DEFAULT_SIZE (64UL<<20) +unsigned long swiotlb_size_or_default(void) +{ + unsigned long size; + + size = io_tlb_nslabs << IO_TLB_SHIFT; + + return size ? size : (IO_TLB_DEFAULT_SIZE); +} + /* Note that this doesn't work with highmem page */ static dma_addr_t swiotlb_virt_to_bus(struct device *hwdev, volatile void *address) @@ -188,8 +200,7 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) void __init swiotlb_init(int verbose) { - /* default to 64MB */ - size_t default_size = 64UL<<20; + size_t default_size = IO_TLB_DEFAULT_SIZE; unsigned char *vstart; unsigned long bytes; -- cgit v1.2.3 From 4b59e6c4730978679b414a8da61514a2518da512 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 29 Apr 2013 15:06:11 -0700 Subject: mm, show_mem: suppress page counts in non-blockable contexts On large systems with a lot of memory, walking all RAM to determine page types may take a half second or even more. In non-blockable contexts, the page allocator will emit a page allocation failure warning unless __GFP_NOWARN is specified. In such contexts, irqs are typically disabled and such a lengthy delay may even result in NMI watchdog timeouts. To fix this, suppress the page walk in such contexts when printing the page allocation failure warning. Signed-off-by: David Rientjes Cc: Mel Gorman Acked-by: Michal Hocko Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/show_mem.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'lib') diff --git a/lib/show_mem.c b/lib/show_mem.c index 4407f8c9b1f7..b7c72311ad0c 100644 --- a/lib/show_mem.c +++ b/lib/show_mem.c @@ -18,6 +18,9 @@ void show_mem(unsigned int filter) printk("Mem-Info:\n"); show_free_areas(filter); + if (filter & SHOW_MEM_FILTER_PAGE_COUNT) + return; + for_each_online_pgdat(pgdat) { unsigned long i, flags; -- cgit v1.2.3 From 9375db07adeaeea5f5ea7ca0463a8b371d71ddbb Mon Sep 17 00:00:00 2001 From: Philipp Zabel Date: Mon, 29 Apr 2013 16:17:10 -0700 Subject: genalloc: add devres support, allow to find a managed pool by device This patch adds three exported functions to lib/genalloc.c: devm_gen_pool_create, dev_get_gen_pool, and of_get_named_gen_pool. devm_gen_pool_create is a managed version of gen_pool_create that keeps track of the pool via devres and allows the management code to automatically destroy it after device removal. dev_get_gen_pool retrieves the gen_pool for a given device, if it was created with devm_gen_pool_create, using devres_find. of_get_named_gen_pool retrieves the gen_pool for a given device node and property name, where the property must contain a phandle pointing to a platform device node. The corresponding platform device is then fed into dev_get_gen_pool and the resulting gen_pool is returned. 
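For illustration (not part of the patch): a platform driver could create a managed pool in probe() and let unrelated code find it later via the device. The SRAM base, the pool parameters and the probe function below are placeholders, not taken from any real driver.

#include <linux/genalloc.h>
#include <linux/platform_device.h>
#include <linux/sizes.h>

static int example_probe(struct platform_device *pdev)
{
	struct gen_pool *pool;
	unsigned long sram_virt = 0;	/* placeholder: ioremapped SRAM base */

	/* 2^5 = 32 byte granularity; the pool is destroyed on device removal */
	pool = devm_gen_pool_create(&pdev->dev, 5, dev_to_node(&pdev->dev));
	if (!pool)
		return -ENOMEM;

	return gen_pool_add(pool, sram_virt, SZ_64K, dev_to_node(&pdev->dev));
}

Code that only holds the struct device can then call gen_pool_alloc(dev_get_gen_pool(dev), size), and a device-tree consumer can reach the same pool through of_get_named_gen_pool().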
[akpm@linux-foundation.org: make the of_get_named_gen_pool() stub static, fixing a zillion link errors] [akpm@linux-foundation.org: squish "struct device declared inside parameter list" warning] Signed-off-by: Philipp Zabel Acked-by: Grant Likely Tested-by: Michal Simek Cc: Fabio Estevam Cc: Matt Porter Cc: Dong Aisheng Cc: Greg Kroah-Hartman Cc: Rob Herring Cc: Paul Gortmaker Cc: Javier Martin Cc: Huang Shijie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/genalloc.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 81 insertions(+) (limited to 'lib') diff --git a/lib/genalloc.c b/lib/genalloc.c index 54920433705a..b35cfa9bc3d4 100644 --- a/lib/genalloc.c +++ b/lib/genalloc.c @@ -34,6 +34,8 @@ #include #include #include +#include +#include static int set_bits_ll(unsigned long *addr, unsigned long mask_to_set) { @@ -480,3 +482,82 @@ unsigned long gen_pool_best_fit(unsigned long *map, unsigned long size, return start_bit; } EXPORT_SYMBOL(gen_pool_best_fit); + +static void devm_gen_pool_release(struct device *dev, void *res) +{ + gen_pool_destroy(*(struct gen_pool **)res); +} + +/** + * devm_gen_pool_create - managed gen_pool_create + * @dev: device that provides the gen_pool + * @min_alloc_order: log base 2 of number of bytes each bitmap bit represents + * @nid: node id of the node the pool structure should be allocated on, or -1 + * + * Create a new special memory pool that can be used to manage special purpose + * memory not managed by the regular kmalloc/kfree interface. The pool will be + * automatically destroyed by the device management code. + */ +struct gen_pool *devm_gen_pool_create(struct device *dev, int min_alloc_order, + int nid) +{ + struct gen_pool **ptr, *pool; + + ptr = devres_alloc(devm_gen_pool_release, sizeof(*ptr), GFP_KERNEL); + + pool = gen_pool_create(min_alloc_order, nid); + if (pool) { + *ptr = pool; + devres_add(dev, ptr); + } else { + devres_free(ptr); + } + + return pool; +} + +/** + * dev_get_gen_pool - Obtain the gen_pool (if any) for a device + * @dev: device to retrieve the gen_pool from + * @name: Optional name for the gen_pool, usually NULL + * + * Returns the gen_pool for the device if one is present, or NULL. + */ +struct gen_pool *dev_get_gen_pool(struct device *dev) +{ + struct gen_pool **p = devres_find(dev, devm_gen_pool_release, NULL, + NULL); + + if (!p) + return NULL; + return *p; +} +EXPORT_SYMBOL_GPL(dev_get_gen_pool); + +#ifdef CONFIG_OF +/** + * of_get_named_gen_pool - find a pool by phandle property + * @np: device node + * @propname: property name containing phandle(s) + * @index: index into the phandle array + * + * Returns the pool that contains the chunk starting at the physical + * address of the device tree node pointed at by the phandle property, + * or NULL if not found. 
+ */ +struct gen_pool *of_get_named_gen_pool(struct device_node *np, + const char *propname, int index) +{ + struct platform_device *pdev; + struct device_node *np_pool; + + np_pool = of_parse_phandle(np, propname, index); + if (!np_pool) + return NULL; + pdev = of_find_device_by_node(np_pool); + if (!pdev) + return NULL; + return dev_get_gen_pool(&pdev->dev); +} +EXPORT_SYMBOL_GPL(of_get_named_gen_pool); +#endif /* CONFIG_OF */ -- cgit v1.2.3 From 30493cc9dddb68066dcc4878015660fdaa8e0965 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 29 Apr 2013 16:18:09 -0700 Subject: lib/int_sqrt.c: optimize square root algorithm Optimize the current version of the shift-and-subtract (hardware) algorithm, described by John von Newmann[1] and Guy L Steele. Iterating 1,000,000 times, perf shows for the current version: Performance counter stats for './sqrt-curr' (10 runs): 27.170996 task-clock # 0.979 CPUs utilized ( +- 3.19% ) 3 context-switches # 0.103 K/sec ( +- 4.76% ) 0 cpu-migrations # 0.004 K/sec ( +-100.00% ) 104 page-faults # 0.004 M/sec ( +- 0.16% ) 64,921,199 cycles # 2.389 GHz ( +- 0.03% ) 28,967,789 stalled-cycles-frontend # 44.62% frontend cycles idle ( +- 0.18% ) stalled-cycles-backend 104,502,623 instructions # 1.61 insns per cycle # 0.28 stalled cycles per insn ( +- 0.00% ) 34,088,368 branches # 1254.587 M/sec ( +- 0.00% ) 4,901 branch-misses # 0.01% of all branches ( +- 1.32% ) 0.027763015 seconds time elapsed ( +- 3.22% ) And for the new version: Performance counter stats for './sqrt-new' (10 runs): 0.496869 task-clock # 0.519 CPUs utilized ( +- 2.38% ) 0 context-switches # 0.000 K/sec 0 cpu-migrations # 0.403 K/sec ( +-100.00% ) 104 page-faults # 0.209 M/sec ( +- 0.15% ) 590,760 cycles # 1.189 GHz ( +- 2.35% ) 395,053 stalled-cycles-frontend # 66.87% frontend cycles idle ( +- 3.67% ) stalled-cycles-backend 398,963 instructions # 0.68 insns per cycle # 0.99 stalled cycles per insn ( +- 0.39% ) 70,228 branches # 141.341 M/sec ( +- 0.36% ) 3,364 branch-misses # 4.79% of all branches ( +- 5.45% ) 0.000957440 seconds time elapsed ( +- 2.42% ) Furthermore, this saves space in instruction text: text data bss dec hex filename 111 0 0 111 6f lib/int_sqrt-baseline.o 89 0 0 89 59 lib/int_sqrt.o [1] http://en.wikipedia.org/wiki/First_Draft_of_a_Report_on_the_EDVAC Signed-off-by: Davidlohr Bueso Reviewed-by: Jonathan Gonzalez Tested-by: Jonathan Gonzalez Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/int_sqrt.c | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) (limited to 'lib') diff --git a/lib/int_sqrt.c b/lib/int_sqrt.c index fc2eeb7cb2ea..1ef4cc344977 100644 --- a/lib/int_sqrt.c +++ b/lib/int_sqrt.c @@ -1,3 +1,9 @@ +/* + * Copyright (C) 2013 Davidlohr Bueso + * + * Based on the shift-and-subtract algorithm for computing integer + * square root from Guy L. Steele. 
+ */ #include #include @@ -10,23 +16,23 @@ */ unsigned long int_sqrt(unsigned long x) { - unsigned long op, res, one; + unsigned long b, m, y = 0; - op = x; - res = 0; + if (x <= 1) + return x; - one = 1UL << (BITS_PER_LONG - 2); - while (one > op) - one >>= 2; + m = 1UL << (BITS_PER_LONG - 2); + while (m != 0) { + b = y + m; + y >>= 1; - while (one != 0) { - if (op >= res + one) { - op = op - (res + one); - res = res + 2 * one; + if (x >= b) { + x -= b; + y += m; } - res /= 2; - one /= 4; + m >>= 2; } - return res; + + return y; } EXPORT_SYMBOL(int_sqrt); -- cgit v1.2.3 From 095d141b2e40665289d44a42d387ffac2841ef82 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 29 Apr 2013 16:18:10 -0700 Subject: argv_split(): teach it to handle mutable strings argv_split() allocates argv[count_argc(str)] array and assumes that it will find the same number of arguments later. This is obviously wrong if this string can be changed, say, by sysctl. With this patch argv_split() kstrndup's the whole string and does not split it, we simply replace the spaces with zeroes and keep the allocated memory in argv[-1] for argv_free(arg). We do not use argv[0] because: - str can be all-spaces or empty. In fact this case is fine, we could kfree() it before return, but: - str can have a space at the start, and we can not rely on kstrndup(skip_spaces(str)) because it can equally race if this string is mutable. Also, simplify count_argc() and kill the no longer used skip_arg(). Signed-off-by: Oleg Nesterov Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/argv_split.c | 87 ++++++++++++++++++++++++++------------------------------ 1 file changed, 40 insertions(+), 47 deletions(-) (limited to 'lib') diff --git a/lib/argv_split.c b/lib/argv_split.c index 1e9a6cbc3689..e927ed0e18a8 100644 --- a/lib/argv_split.c +++ b/lib/argv_split.c @@ -8,23 +8,17 @@ #include #include -static const char *skip_arg(const char *cp) -{ - while (*cp && !isspace(*cp)) - cp++; - - return cp; -} - static int count_argc(const char *str) { int count = 0; + bool was_space; - while (*str) { - str = skip_spaces(str); - if (*str) { + for (was_space = true; *str; str++) { + if (isspace(*str)) { + was_space = true; + } else if (was_space) { + was_space = false; count++; - str = skip_arg(str); } } @@ -39,10 +33,8 @@ static int count_argc(const char *str) */ void argv_free(char **argv) { - char **p; - for (p = argv; *p; p++) - kfree(*p); - + argv--; + kfree(argv[0]); kfree(argv); } EXPORT_SYMBOL(argv_free); @@ -59,43 +51,44 @@ EXPORT_SYMBOL(argv_free); * considered to be a single argument separator. The returned array * is always NULL-terminated. Returns NULL on memory allocation * failure. + * + * The source string at `str' may be undergoing concurrent alteration via + * userspace sysctl activity (at least). The argv_split() implementation + * attempts to handle this gracefully by taking a local copy to work on. 
*/ char **argv_split(gfp_t gfp, const char *str, int *argcp) { - int argc = count_argc(str); - char **argv = kzalloc(sizeof(*argv) * (argc+1), gfp); - char **argvp; - - if (argv == NULL) - goto out; - - if (argcp) - *argcp = argc; - - argvp = argv; - - while (*str) { - str = skip_spaces(str); - - if (*str) { - const char *p = str; - char *t; - - str = skip_arg(str); + char *argv_str; + bool was_space; + char **argv, **argv_ret; + int argc; + + argv_str = kstrndup(str, KMALLOC_MAX_SIZE - 1, gfp); + if (!argv_str) + return NULL; + + argc = count_argc(argv_str); + argv = kmalloc(sizeof(*argv) * (argc + 2), gfp); + if (!argv) { + kfree(argv_str); + return NULL; + } - t = kstrndup(p, str-p, gfp); - if (t == NULL) - goto fail; - *argvp++ = t; + *argv = argv_str; + argv_ret = ++argv; + for (was_space = true; *argv_str; argv_str++) { + if (isspace(*argv_str)) { + was_space = true; + *argv_str = 0; + } else if (was_space) { + was_space = false; + *argv++ = argv_str; } } - *argvp = NULL; - - out: - return argv; + *argv = NULL; - fail: - argv_free(argv); - return NULL; + if (argcp) + *argcp = argc; + return argv_ret; } EXPORT_SYMBOL(argv_split); -- cgit v1.2.3 From 2e0fb404c86d6c86dc33e284310eb5d28d192dcf Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 29 Apr 2013 16:18:11 -0700 Subject: lib, net: make isodigit() public and use it There are at least two users of isodigit(). Let's make it a public function of ctype.h. Signed-off-by: Andy Shevchenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/dynamic_debug.c | 1 - 1 file changed, 1 deletion(-) (limited to 'lib') diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c index 5276b99ca650..46032453abd5 100644 --- a/lib/dynamic_debug.c +++ b/lib/dynamic_debug.c @@ -281,7 +281,6 @@ static inline int parse_lineno(const char *str, unsigned int *val) * allow the user to express a query which matches a format * containing embedded spaces. */ -#define isodigit(c) ((c) >= '0' && (c) <= '7') static char *unescape(char *str) { char *in = str; -- cgit v1.2.3 From 3e6628c4b347a558965041290c5a92791dd4c741 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 29 Apr 2013 16:21:16 -0700 Subject: idr: introduce idr_alloc_cyclic() As Tejun points out, there are several users of the IDR facility that attempt to use it in a cyclic fashion. These users are likely to see -ENOSPC errors after the counter wraps one or more times however. This patchset adds a new idr_alloc_cyclic routine and converts several of these users to it. Many of these users are in obscure parts of the kernel, and I don't have a good way to test some of them. The change is pretty straightforward though, so hopefully it won't be an issue. There is one other cyclic user of idr_alloc that I didn't touch in ipc/util.c. That one is doing some strange stuff that I didn't quite understand, but it looks like it should probably be converted later somehow. This patch: Thus spake Tejun Heo: Ooh, BTW, the cyclic allocation is broken. It's prone to -ENOSPC after the first wraparound. There are several cyclic users in the kernel and I think it probably would be best to implement cyclic support in idr. This patch does that by adding new idr_alloc_cyclic function that such users in the kernel can use. With this, there's no need for a caller to keep track of the last value used as that's now tracked internally. This should prevent the ENOSPC problems that can hit when the "last allocated" counter exceeds INT_MAX. Later patches will convert existing cyclic users to the new interface. 
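A minimal usage sketch (not part of the patch; session_idr and the omitted locking are assumptions): a caller that wants monotonically increasing ids no longer needs its own "last id" counter.

#include <linux/idr.h>
#include <linux/gfp.h>

static DEFINE_IDR(session_idr);

static int new_session_id(void *session)
{
	int id;

	idr_preload(GFP_KERNEL);
	/* ids grow monotonically; on wraparound the lowest free id >= 1 is handed out again */
	id = idr_alloc_cyclic(&session_idr, session, 1, 0, GFP_NOWAIT);
	idr_preload_end();

	return id;	/* new id on success, negative errno on failure */
}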
Signed-off-by: Jeff Layton Reviewed-by: Tejun Heo Cc: "David S. Miller" Cc: "J. Bruce Fields" Cc: Eric Paris Cc: Jack Morgenstein Cc: John McCutchan Cc: Neil Horman Cc: Or Gerlitz Cc: Robert Love Cc: Roland Dreier Cc: Sridhar Samudrala Cc: Steve Wise Cc: Tom Tucker Cc: Vlad Yasevich Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/idr.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'lib') diff --git a/lib/idr.c b/lib/idr.c index 322e2816f2fb..cca4b9302a71 100644 --- a/lib/idr.c +++ b/lib/idr.c @@ -495,6 +495,33 @@ int idr_alloc(struct idr *idr, void *ptr, int start, int end, gfp_t gfp_mask) } EXPORT_SYMBOL_GPL(idr_alloc); +/** + * idr_alloc_cyclic - allocate new idr entry in a cyclical fashion + * @idr: the (initialized) idr + * @ptr: pointer to be associated with the new id + * @start: the minimum id (inclusive) + * @end: the maximum id (exclusive, <= 0 for max) + * @gfp_mask: memory allocation flags + * + * Essentially the same as idr_alloc, but prefers to allocate progressively + * higher ids if it can. If the "cur" counter wraps, then it will start again + * at the "start" end of the range and allocate one that has already been used. + */ +int idr_alloc_cyclic(struct idr *idr, void *ptr, int start, int end, + gfp_t gfp_mask) +{ + int id; + + id = idr_alloc(idr, ptr, max(start, idr->cur), end, gfp_mask); + if (id == -ENOSPC) + id = idr_alloc(idr, ptr, start, end, gfp_mask); + + if (likely(id >= 0)) + idr->cur = id + 1; + return id; +} +EXPORT_SYMBOL(idr_alloc_cyclic); + static void idr_remove_warning(int id) { printk(KERN_WARNING -- cgit v1.2.3 From cedddb00023b1321f2bb2739739aa8c9927fc063 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Mon, 29 Apr 2013 16:21:25 -0700 Subject: uuid: use prandom_bytes() Use prandom_bytes() to generate 16 bytes of pseudo-random bytes. Signed-off-by: Akinobu Mita Cc: "Theodore Ts'o" Cc: Huang Ying Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/uuid.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'lib') diff --git a/lib/uuid.c b/lib/uuid.c index 52a6fe6387de..398821e4dce1 100644 --- a/lib/uuid.c +++ b/lib/uuid.c @@ -25,13 +25,7 @@ static void __uuid_gen_common(__u8 b[16]) { - int i; - u32 r; - - for (i = 0; i < 4; i++) { - r = random32(); - memcpy(b + i * 4, &r, 4); - } + prandom_bytes(b, 16); /* reversion 0b10 */ b[8] = (b[8] & 0x3F) | 0x80; } -- cgit v1.2.3 From f39fee5f11f7adfcf5c9643f87718b80450be18a Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Mon, 29 Apr 2013 16:21:28 -0700 Subject: lib/: rename random32() to prandom_u32() Use preferable function name which implies using a pseudo-random number generator. 
Signed-off-by: Akinobu Mita Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/fault-inject.c | 2 +- lib/list_sort.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/fault-inject.c b/lib/fault-inject.c index f7210ad6cffd..c5c7a762b850 100644 --- a/lib/fault-inject.c +++ b/lib/fault-inject.c @@ -122,7 +122,7 @@ bool should_fail(struct fault_attr *attr, ssize_t size) return false; } - if (attr->probability <= random32() % 100) + if (attr->probability <= prandom_u32() % 100) return false; if (!fail_stacktrace(attr)) diff --git a/lib/list_sort.c b/lib/list_sort.c index d7325c6b103f..1183fa70a44d 100644 --- a/lib/list_sort.c +++ b/lib/list_sort.c @@ -229,7 +229,7 @@ static int __init list_sort_test(void) goto exit; } /* force some equivalencies */ - el->value = random32() % (TEST_LIST_LEN/3); + el->value = prandom_u32() % (TEST_LIST_LEN / 3); el->serial = i; el->poison1 = TEST_POISON1; el->poison2 = TEST_POISON2; -- cgit v1.2.3 From f3002134158092178be81339ec5a22ff80e6c308 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Tue, 30 Apr 2013 11:35:07 +0200 Subject: Revert "math64: New div64_u64_rem helper" This reverts commit f792685006274a850e6cc0ea9ade275ccdfc90bc. The cputime scaling code was changed/fixed and does not need the div64_u64_rem() primitive anymore. It has no other users, so let's remove them. Signed-off-by: Stanislaw Gruszka Cc: Frederic Weisbecker Cc: rostedt@goodmis.org Cc: Linus Torvalds Cc: Dave Hansen Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1367314507-9728-4-git-send-email-sgruszka@redhat.com Signed-off-by: Ingo Molnar --- lib/div64.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) (limited to 'lib') diff --git a/lib/div64.c b/lib/div64.c index 3af5728d95fd..a163b6caef73 100644 --- a/lib/div64.c +++ b/lib/div64.c @@ -79,10 +79,9 @@ EXPORT_SYMBOL(div_s64_rem); #endif /** - * div64_u64_rem - unsigned 64bit divide with 64bit divisor and 64bit remainder + * div64_u64 - unsigned 64bit divide with 64bit divisor * @dividend: 64bit dividend * @divisor: 64bit divisor - * @remainder: 64bit remainder * * This implementation is a modified version of the algorithm proposed * by the book 'Hacker's Delight'. The original source and full proof @@ -90,33 +89,27 @@ EXPORT_SYMBOL(div_s64_rem); * * 'http://www.hackersdelight.org/HDcode/newCode/divDouble.c.txt' */ -#ifndef div64_u64_rem -u64 div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder) +#ifndef div64_u64 +u64 div64_u64(u64 dividend, u64 divisor) { u32 high = divisor >> 32; u64 quot; if (high == 0) { - u32 rem32; - quot = div_u64_rem(dividend, divisor, &rem32); - *remainder = rem32; + quot = div_u64(dividend, divisor); } else { int n = 1 + fls(high); quot = div_u64(dividend >> n, divisor >> n); if (quot != 0) quot--; - - *remainder = dividend - quot * divisor; - if (*remainder >= divisor) { + if ((dividend - quot * divisor) >= divisor) quot++; - *remainder -= divisor; - } } return quot; } -EXPORT_SYMBOL(div64_u64_rem); +EXPORT_SYMBOL(div64_u64); #endif /** -- cgit v1.2.3 From b0d33c2bd77bcf2d7c9427d2361ac57fe5b33aa1 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 12 Dec 2012 10:18:50 -0800 Subject: vsprintf: Add extension %pSR - print_symbol replacement print_symbol takes a long and converts it to a function name and offset. %pS does something similar, but doesn't translate the address via __builtin_extract_return_addr. %pSR does the translation. 
This will enable replacing multiple calls like printk(...); printk_symbol(addr); printk("\n"); with a single non-interleavable in dmesg printk("... %pSR\n", (void *)addr); Update documentation too. Signed-off-by: Joe Perches Signed-off-by: Jiri Kosina --- lib/vsprintf.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'lib') diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 0d62fd700f68..e149c6416384 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -534,14 +534,21 @@ char *string(char *buf, char *end, const char *s, struct printf_spec spec) static noinline_for_stack char *symbol_string(char *buf, char *end, void *ptr, - struct printf_spec spec, char ext) + struct printf_spec spec, const char *fmt) { - unsigned long value = (unsigned long) ptr; + unsigned long value; #ifdef CONFIG_KALLSYMS char sym[KSYM_SYMBOL_LEN]; - if (ext == 'B') +#endif + + if (fmt[1] == 'R') + ptr = __builtin_extract_return_addr(ptr); + value = (unsigned long)ptr; + +#ifdef CONFIG_KALLSYMS + if (*fmt == 'B') sprint_backtrace(sym, value); - else if (ext != 'f' && ext != 's') + else if (*fmt != 'f' && *fmt != 's') sprint_symbol(sym, value); else sprint_symbol_no_offset(sym, value); @@ -987,6 +994,7 @@ int kptr_restrict __read_mostly; * - 'f' For simple symbolic function names without offset * - 'S' For symbolic direct pointers with offset * - 's' For symbolic direct pointers without offset + * - '[FfSs]R' as above with __builtin_extract_return_addr() translation * - 'B' For backtraced symbolic direct pointers with offset * - 'R' For decoded struct resource, e.g., [mem 0x0-0x1f 64bit pref] * - 'r' For raw struct resource, e.g., [mem 0x0-0x1f flags 0x201] @@ -1060,7 +1068,7 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr, case 'S': case 's': case 'B': - return symbol_string(buf, end, ptr, spec, *fmt); + return symbol_string(buf, end, ptr, spec, fmt); case 'R': case 'r': return resource_string(buf, end, ptr, spec, fmt); -- cgit v1.2.3 From 196779b9b4ce1922afabdc20d0270720603bd46c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 30 Apr 2013 15:27:12 -0700 Subject: dump_stack: consolidate dump_stack() implementations and unify their behaviors Both dump_stack() and show_stack() are currently implemented by each architecture. show_stack(NULL, NULL) dumps the backtrace for the current task as does dump_stack(). On some archs, dump_stack() prints extra information - pid, utsname and so on - in addition to the backtrace while the two are identical on other archs. The usages in arch-independent code of the two functions indicate show_stack(NULL, NULL) should print out bare backtrace while dump_stack() is used for debugging purposes when something went wrong, so it does make sense to print additional information on the task which triggered dump_stack(). There's no reason to require archs to implement two separate but mostly identical functions. It leads to unnecessary subtle information. This patch expands the dummy fallback dump_stack() implementation in lib/dump_stack.c such that it prints out debug information (taken from x86) and invokes show_stack(NULL, NULL) and drops arch-specific dump_stack() implementations in all archs except blackfin. Blackfin's dump_stack() does something wonky that I don't understand. Debug information can be printed separately by calling dump_stack_print_info() so that arch-specific dump_stack() implementation can still emit the same debug information. This is used in blackfin. This patch brings the following behavior changes. 
* On some archs, an extra level in backtrace for show_stack() could be printed. This is because the top frame was determined in dump_stack() on those archs while generic dump_stack() can't do that reliably. It can be compensated by inlining dump_stack() but not sure whether that'd be necessary. * Most archs didn't use to print debug info on dump_stack(). They do now. An example WARN dump follows. WARNING: at kernel/workqueue.c:4841 init_workqueues+0x35/0x505() Hardware name: empty Modules linked in: CPU: 0 PID: 1 Comm: swapper/0 Not tainted 3.9.0-rc1-work+ #9 0000000000000009 ffff88007c861e08 ffffffff81c614dc ffff88007c861e48 ffffffff8108f50f ffffffff82228240 0000000000000040 ffffffff8234a03c 0000000000000000 0000000000000000 0000000000000000 ffff88007c861e58 Call Trace: [] dump_stack+0x19/0x1b [] warn_slowpath_common+0x7f/0xc0 [] warn_slowpath_null+0x1a/0x20 [] init_workqueues+0x35/0x505 ... v2: CPU number added to the generic debug info as requested by s390 folks and dropped the s390 specific dump_stack(). This loses %ksp from the debug message which the maintainers think isn't important enough to keep the s390-specific dump_stack() implementation. dump_stack_print_info() is moved to kernel/printk.c from lib/dump_stack.c. Because linkage is per objecct file, dump_stack_print_info() living in the same lib file as generic dump_stack() means that archs which implement custom dump_stack() - at this point, only blackfin - can't use dump_stack_print_info() as that will bring in the generic version of dump_stack() too. v1 The v1 patch broke build on blackfin due to this issue. The build breakage was reported by Fengguang Wu. Signed-off-by: Tejun Heo Acked-by: David S. Miller Acked-by: Vineet Gupta Acked-by: Jesper Nilsson Acked-by: Vineet Gupta Acked-by: Martin Schwidefsky [s390 bits] Cc: Heiko Carstens Cc: Mike Frysinger Cc: Fengguang Wu Cc: Bjorn Helgaas Cc: Sam Ravnborg Acked-by: Richard Kuo [hexagon bits] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/dump_stack.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'lib') diff --git a/lib/dump_stack.c b/lib/dump_stack.c index 42f4f55c9458..53bad099ebd6 100644 --- a/lib/dump_stack.c +++ b/lib/dump_stack.c @@ -5,11 +5,16 @@ #include #include +#include +/** + * dump_stack - dump the current task information and its stack trace + * + * Architectures can override this implementation by implementing its own. + */ void dump_stack(void) { - printk(KERN_NOTICE - "This architecture does not implement dump_stack()\n"); + dump_stack_print_info(KERN_DEFAULT); + show_stack(NULL, NULL); } - EXPORT_SYMBOL(dump_stack); -- cgit v1.2.3 From 16c7fa05829e8b91db48e3539c5d6ff3c2b18a23 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 30 Apr 2013 15:27:30 -0700 Subject: lib/string_helpers: introduce generic string_unescape There are several places in kernel where modules unescapes input to convert C-Style Escape Sequences into byte codes. The patch provides generic implementation of such approach. Test cases are also included into the patch. 
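A small usage sketch (not part of the patch): expanding C-style escapes in place in a buffer that arrived as escaped text.

#include <linux/string_helpers.h>

static void unescape_example(void)
{
	char buf[] = "foo\\tbar\\n";	/* holds literal backslash sequences */
	int len;

	/* expand \t, \n, octal, hex and special escapes in one pass */
	len = string_unescape_any_inplace(buf);
	/* buf now holds "foo\tbar\n" and len == 8 */
}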
[akpm@linux-foundation.org: clarify comment] [akpm@linux-foundation.org: export get_random_int() to modules] Signed-off-by: Andy Shevchenko Cc: Samuel Thibault Cc: Greg Kroah-Hartman Cc: Jason Baron Cc: Alexander Viro Cc: William Hubbs Cc: Chris Brannon Cc: Kirk Reiser Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig.debug | 3 ++ lib/Makefile | 4 +- lib/string_helpers.c | 133 ++++++++++++++++++++++++++++++++++++++++++++++ lib/test-string_helpers.c | 103 +++++++++++++++++++++++++++++++++++ 4 files changed, 242 insertions(+), 1 deletion(-) create mode 100644 lib/test-string_helpers.c (limited to 'lib') diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 28be08c09bab..77ebaa3dfa12 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1463,5 +1463,8 @@ source "lib/Kconfig.kgdb" source "lib/Kconfig.kmemcheck" +config TEST_STRING_HELPERS + tristate "Test functions located in the string_helpers module at runtime" + config TEST_KSTRTOX tristate "Test kstrto*() family of functions at runtime" diff --git a/lib/Makefile b/lib/Makefile index 6e2cc561f761..23c9a0fe74fc 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -22,8 +22,10 @@ lib-y += kobject.o klist.o obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \ bust_spinlocks.o hexdump.o kasprintf.o bitmap.o scatterlist.o \ - string_helpers.o gcd.o lcm.o list_sort.o uuid.o flex_array.o \ + gcd.o lcm.o list_sort.o uuid.o flex_array.o \ bsearch.o find_last_bit.o find_next_bit.o llist.o memweight.o kfifo.o +obj-y += string_helpers.o +obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o obj-y += kstrtox.o obj-$(CONFIG_TEST_KSTRTOX) += test-kstrtox.o diff --git a/lib/string_helpers.c b/lib/string_helpers.c index 1cffc223bff5..ed5c1454dd62 100644 --- a/lib/string_helpers.c +++ b/lib/string_helpers.c @@ -2,10 +2,12 @@ * Helpers for formatting and printing strings * * Copyright 31 August 2008 James Bottomley + * Copyright (C) 2013, Intel Corporation */ #include #include #include +#include #include /** @@ -66,3 +68,134 @@ int string_get_size(u64 size, const enum string_size_units units, return 0; } EXPORT_SYMBOL(string_get_size); + +static bool unescape_space(char **src, char **dst) +{ + char *p = *dst, *q = *src; + + switch (*q) { + case 'n': + *p = '\n'; + break; + case 'r': + *p = '\r'; + break; + case 't': + *p = '\t'; + break; + case 'v': + *p = '\v'; + break; + case 'f': + *p = '\f'; + break; + default: + return false; + } + *dst += 1; + *src += 1; + return true; +} + +static bool unescape_octal(char **src, char **dst) +{ + char *p = *dst, *q = *src; + u8 num; + + if (isodigit(*q) == 0) + return false; + + num = (*q++) & 7; + while (num < 32 && isodigit(*q) && (q - *src < 3)) { + num <<= 3; + num += (*q++) & 7; + } + *p = num; + *dst += 1; + *src = q; + return true; +} + +static bool unescape_hex(char **src, char **dst) +{ + char *p = *dst, *q = *src; + int digit; + u8 num; + + if (*q++ != 'x') + return false; + + num = digit = hex_to_bin(*q++); + if (digit < 0) + return false; + + digit = hex_to_bin(*q); + if (digit >= 0) { + q++; + num = (num << 4) | digit; + } + *p = num; + *dst += 1; + *src = q; + return true; +} + +static bool unescape_special(char **src, char **dst) +{ + char *p = *dst, *q = *src; + + switch (*q) { + case '\"': + *p = '\"'; + break; + case '\\': + *p = '\\'; + break; + case 'a': + *p = '\a'; + break; + case 'e': + *p = '\e'; + break; + default: + return false; + } + *dst += 1; + *src += 1; + return true; +} + +int string_unescape(char *src, char *dst, size_t size, 
unsigned int flags) +{ + char *out = dst; + + while (*src && --size) { + if (src[0] == '\\' && src[1] != '\0' && size > 1) { + src++; + size--; + + if (flags & UNESCAPE_SPACE && + unescape_space(&src, &out)) + continue; + + if (flags & UNESCAPE_OCTAL && + unescape_octal(&src, &out)) + continue; + + if (flags & UNESCAPE_HEX && + unescape_hex(&src, &out)) + continue; + + if (flags & UNESCAPE_SPECIAL && + unescape_special(&src, &out)) + continue; + + *out++ = '\\'; + } + *out++ = *src++; + } + *out = '\0'; + + return out - dst; +} +EXPORT_SYMBOL(string_unescape); diff --git a/lib/test-string_helpers.c b/lib/test-string_helpers.c new file mode 100644 index 000000000000..6ac48de04c0e --- /dev/null +++ b/lib/test-string_helpers.c @@ -0,0 +1,103 @@ +/* + * Test cases for lib/string_helpers.c module. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include + +struct test_string { + const char *in; + const char *out; + unsigned int flags; +}; + +static const struct test_string strings[] __initconst = { + { + .in = "\\f\\ \\n\\r\\t\\v", + .out = "\f\\ \n\r\t\v", + .flags = UNESCAPE_SPACE, + }, + { + .in = "\\40\\1\\387\\0064\\05\\040\\8a\\110\\777", + .out = " \001\00387\0064\005 \\8aH?7", + .flags = UNESCAPE_OCTAL, + }, + { + .in = "\\xv\\xa\\x2c\\xD\\x6f2", + .out = "\\xv\n,\ro2", + .flags = UNESCAPE_HEX, + }, + { + .in = "\\h\\\\\\\"\\a\\e\\", + .out = "\\h\\\"\a\e\\", + .flags = UNESCAPE_SPECIAL, + }, +}; + +static void __init test_string_unescape(unsigned int flags, bool inplace) +{ + char in[256]; + char out_test[256]; + char out_real[256]; + int i, p = 0, q_test = 0, q_real = sizeof(out_real); + + for (i = 0; i < ARRAY_SIZE(strings); i++) { + const char *s = strings[i].in; + int len = strlen(strings[i].in); + + /* Copy string to in buffer */ + memcpy(&in[p], s, len); + p += len; + + /* Copy expected result for given flags */ + if (flags & strings[i].flags) { + s = strings[i].out; + len = strlen(strings[i].out); + } + memcpy(&out_test[q_test], s, len); + q_test += len; + } + in[p++] = '\0'; + + /* Call string_unescape and compare result */ + if (inplace) { + memcpy(out_real, in, p); + if (flags == UNESCAPE_ANY) + q_real = string_unescape_any_inplace(out_real); + else + q_real = string_unescape_inplace(out_real, flags); + } else if (flags == UNESCAPE_ANY) { + q_real = string_unescape_any(in, out_real, q_real); + } else { + q_real = string_unescape(in, out_real, q_real, flags); + } + + if (q_real != q_test || memcmp(out_test, out_real, q_test)) { + pr_warn("Test failed: flags = %u\n", flags); + print_hex_dump(KERN_WARNING, "Input: ", + DUMP_PREFIX_NONE, 16, 1, in, p - 1, true); + print_hex_dump(KERN_WARNING, "Expected: ", + DUMP_PREFIX_NONE, 16, 1, out_test, q_test, true); + print_hex_dump(KERN_WARNING, "Got: ", + DUMP_PREFIX_NONE, 16, 1, out_real, q_real, true); + } +} + +static int __init test_string_helpers_init(void) +{ + unsigned int i; + + pr_info("Running tests...\n"); + for (i = 0; i < UNESCAPE_ANY + 1; i++) + test_string_unescape(i, false); + test_string_unescape(get_random_int() % (UNESCAPE_ANY + 1), true); + + return -EINVAL; +} +module_init(test_string_helpers_init); +MODULE_LICENSE("Dual BSD/GPL"); -- cgit v1.2.3 From d338b1379f96b20e63aa65082e19e9a18a93b608 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 30 Apr 2013 15:27:32 -0700 Subject: dynamic_debug: reuse generic string_unescape function There is kernel function to do the job in generic way. Let's use it. 
Signed-off-by: Andy Shevchenko Cc: Jason Baron Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/dynamic_debug.c | 48 +++++------------------------------------------- 1 file changed, 5 insertions(+), 43 deletions(-) (limited to 'lib') diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c index 46032453abd5..99fec3ae405a 100644 --- a/lib/dynamic_debug.c +++ b/lib/dynamic_debug.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -276,47 +277,6 @@ static inline int parse_lineno(const char *str, unsigned int *val) return 0; } -/* - * Undo octal escaping in a string, inplace. This is useful to - * allow the user to express a query which matches a format - * containing embedded spaces. - */ -static char *unescape(char *str) -{ - char *in = str; - char *out = str; - - while (*in) { - if (*in == '\\') { - if (in[1] == '\\') { - *out++ = '\\'; - in += 2; - continue; - } else if (in[1] == 't') { - *out++ = '\t'; - in += 2; - continue; - } else if (in[1] == 'n') { - *out++ = '\n'; - in += 2; - continue; - } else if (isodigit(in[1]) && - isodigit(in[2]) && - isodigit(in[3])) { - *out++ = (((in[1] - '0') << 6) | - ((in[2] - '0') << 3) | - (in[3] - '0')); - in += 4; - continue; - } - } - *out++ = *in++; - } - *out = '\0'; - - return str; -} - static int check_set(const char **dest, char *src, char *name) { int rc = 0; @@ -370,8 +330,10 @@ static int ddebug_parse_query(char *words[], int nwords, } else if (!strcmp(words[i], "module")) { rc = check_set(&query->module, words[i+1], "module"); } else if (!strcmp(words[i], "format")) { - rc = check_set(&query->format, unescape(words[i+1]), - "format"); + string_unescape_inplace(words[i+1], UNESCAPE_SPACE | + UNESCAPE_OCTAL | + UNESCAPE_SPECIAL); + rc = check_set(&query->format, words[i+1], "format"); } else if (!strcmp(words[i], "line")) { char *first = words[i+1]; char *last = strchr(first, '-'); -- cgit v1.2.3 From 4130f0efbfe5adb360328b777afa8e45f7e467f7 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 30 Apr 2013 15:28:24 -0700 Subject: rbtree_test: add extra rbtree integrity check Account for the rbtree having 2**bh(v)-1 internal nodes. While this can be seen as a consequence of other checks, Michel states that it nicely sums up what the other properties are for. 
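For reference, the bound the new check relies on is the standard red-black lemma: a node v with black height bh(v) roots a subtree with at least 2**bh(v) - 1 internal nodes, since each child has black height at least bh(v) - 1 and, by induction, n(v) >= 1 + 2 * (2**(bh(v)-1) - 1) = 2**bh(v) - 1. Applied to the root, that is exactly what the added WARN_ON_ONCE() asserts.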
Signed-off-by: Davidlohr Bueso Reviewed-by: Michel Lespinasse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/rbtree_test.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/rbtree_test.c b/lib/rbtree_test.c index af38aedbd874..99515038ff6d 100644 --- a/lib/rbtree_test.c +++ b/lib/rbtree_test.c @@ -117,8 +117,7 @@ static int black_path_count(struct rb_node *rb) static void check(int nr_nodes) { struct rb_node *rb; - int count = 0; - int blacks = 0; + int count = 0, blacks = 0; u32 prev_key = 0; for (rb = rb_first(&root); rb; rb = rb_next(rb)) { @@ -134,7 +133,9 @@ static void check(int nr_nodes) prev_key = node->key; count++; } + WARN_ON_ONCE(count != nr_nodes); + WARN_ON_ONCE(count < (1 << black_path_count(rb_last(&root))) - 1); } static void check_augmented(int nr_nodes) -- cgit v1.2.3 From c75aaa8ed03eb1312ed2990f1a716b2b9cc0df42 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 30 Apr 2013 15:28:25 -0700 Subject: rbtree_test: add __init/__exit annotations Signed-off-by: Davidlohr Bueso Reviewed-by: Michel Lespinasse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/rbtree_test.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/rbtree_test.c b/lib/rbtree_test.c index 99515038ff6d..122f02f9941b 100644 --- a/lib/rbtree_test.c +++ b/lib/rbtree_test.c @@ -149,7 +149,7 @@ static void check_augmented(int nr_nodes) } } -static int rbtree_test_init(void) +static int __init rbtree_test_init(void) { int i, j; cycles_t time1, time2, time; @@ -222,7 +222,7 @@ static int rbtree_test_init(void) return -EAGAIN; /* Fail will directly unload the module */ } -static void rbtree_test_exit(void) +static void __exit rbtree_test_exit(void) { printk(KERN_ALERT "test exit\n"); } -- cgit v1.2.3 From 446f24d1199e8a546ba7c97da3fbb9a505a94795 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Tue, 30 Apr 2013 15:28:42 -0700 Subject: Kconfig: consolidate CONFIG_DEBUG_STRICT_USER_COPY_CHECKS The help text for this config is duplicated across the x86, parisc, and s390 Kconfig.debug files. Arnd Bergman noted that the help text was slightly misleading and should be fixed to state that enabling this option isn't a problem when using pre 4.4 gcc. To simplify the rewording, consolidate the text into lib/Kconfig.debug and modify it there to be more explicit about when you should say N to this config. Also, make the text a bit more generic by stating that this option enables compile time checks so we can cover architectures which emit warnings vs. ones which emit errors. The details of how an architecture decided to implement the checks isn't as important as the concept of compile time checking of copy_from_user() calls. While we're doing this, remove all the copy_from_user_overflow() code that's duplicated many times and place it into lib/ so that any architecture supporting this option can get the function for free. Signed-off-by: Stephen Boyd Acked-by: Arnd Bergmann Acked-by: Ingo Molnar Acked-by: H. 
Peter Anvin Cc: Arjan van de Ven Acked-by: Helge Deller Cc: Heiko Carstens Cc: Stephen Rothwell Cc: Chris Metcalf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig.debug | 18 ++++++++++++++++++ lib/Makefile | 1 + lib/usercopy.c | 9 +++++++++ 3 files changed, 28 insertions(+) create mode 100644 lib/usercopy.c (limited to 'lib') diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 77ebaa3dfa12..770a422a42e8 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1292,6 +1292,24 @@ config LATENCYTOP Enable this option if you want to use the LatencyTOP tool to find out which userspace is blocking on what kernel operations. +config ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS + bool + +config DEBUG_STRICT_USER_COPY_CHECKS + bool "Strict user copy size checks" + depends on ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS + depends on DEBUG_KERNEL && !TRACE_BRANCH_PROFILING + help + Enabling this option turns a certain set of sanity checks for user + copy operations into compile time failures. + + The copy_from_user() etc checks are there to help test if there + are sufficient security checks on the length argument of + the copy operation, by having gcc prove that the argument is + within bounds. + + If unsure, say N. + source mm/Kconfig.debug source kernel/trace/Kconfig diff --git a/lib/Makefile b/lib/Makefile index 23c9a0fe74fc..e9c52e1b853a 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -15,6 +15,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \ is_single_threaded.o plist.o decompress.o kobject_uevent.o \ earlycpio.o +obj-$(CONFIG_ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS) += usercopy.o lib-$(CONFIG_MMU) += ioremap.o lib-$(CONFIG_SMP) += cpumask.o diff --git a/lib/usercopy.c b/lib/usercopy.c new file mode 100644 index 000000000000..4f5b1ddbcd25 --- /dev/null +++ b/lib/usercopy.c @@ -0,0 +1,9 @@ +#include +#include +#include + +void copy_from_user_overflow(void) +{ + WARN(1, "Buffer overflow detected!\n"); +} +EXPORT_SYMBOL(copy_from_user_overflow); -- cgit v1.2.3 From e12a95f40ac78f223deab743a5228f9eb3df8ed1 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Tue, 30 Apr 2013 15:28:49 -0700 Subject: notifier-error-inject: fix module names in Kconfig The Kconfig help text for MEMORY_NOTIFIER_ERROR_INJECT and OF_RECONFIG_NOTIFIER_ERROR_INJECT has mismatched module names. Signed-off-by: Akinobu Mita Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig.debug | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 770a422a42e8..566cf2bc08ea 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1192,7 +1192,7 @@ config MEMORY_NOTIFIER_ERROR_INJECT bash: echo: write error: Cannot allocate memory To compile this code as a module, choose M here: the module will - be called pSeries-reconfig-notifier-error-inject. + be called memory-notifier-error-inject. If unsure, say N. @@ -1209,7 +1209,7 @@ config OF_RECONFIG_NOTIFIER_ERROR_INJECT notified, write the error code to "actions//error". To compile this code as a module, choose M here: the module will - be called memory-notifier-error-inject. + be called of-reconfig-notifier-error-inject. If unsure, say N. 
-- cgit v1.2.3 From 6f9982bdde99d4e6f248613ca780c63de26ba086 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 30 Apr 2013 15:28:50 -0700 Subject: lib/decompress.c: fix initconst Signed-off-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/decompress.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/decompress.c b/lib/decompress.c index 31a804277282..f8fdedaf7b3d 100644 --- a/lib/decompress.c +++ b/lib/decompress.c @@ -38,7 +38,7 @@ struct compress_format { decompress_fn decompressor; }; -static const struct compress_format compressed_formats[] __initdata = { +static const struct compress_format compressed_formats[] __initconst = { { {037, 0213}, "gzip", gunzip }, { {037, 0236}, "gzip", gunzip }, { {0x42, 0x5a}, "bzip2", bunzip2 }, -- cgit v1.2.3 From 9e6879460c8edb0cd3c24c09b83d06541b5af0dc Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 4 May 2013 08:48:27 +0100 Subject: Give the OID registry file module info to avoid kernel tainting Give the OID registry file module information so that it doesn't taint the kernel when compiled as a module and loaded. Reported-by: Dros Adamson Signed-off-by: David Howells cc: Trond Myklebust cc: stable@vger.kernel.org cc: linux-nfs@vger.kernel.org Signed-off-by: Linus Torvalds --- lib/oid_registry.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'lib') diff --git a/lib/oid_registry.c b/lib/oid_registry.c index d8de11f45908..318f382a010d 100644 --- a/lib/oid_registry.c +++ b/lib/oid_registry.c @@ -9,6 +9,7 @@ * 2 of the Licence, or (at your option) any later version. */ +#include #include #include #include @@ -16,6 +17,10 @@ #include #include "oid_registry_data.c" +MODULE_DESCRIPTION("OID Registry"); +MODULE_AUTHOR("Red Hat, Inc."); +MODULE_LICENSE("GPL"); + /** * look_up_OID - Find an OID registration for the specified data * @data: Binary representation of the OID -- cgit v1.2.3 From e2d57f782c8a906f80d5b5f54da803c1638929d7 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Tue, 7 May 2013 06:45:49 -0700 Subject: rwsem: make the waiter type an enumeration rather than a bitmask We are not planning to add some new waiter flags, so we can convert the waiter type into an enumeration. Background: David Howells suggested I do this back when I tried adding a new waiter type for unfair readers. However, I believe the cleanup applies regardless of that use case. 
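The gain is mainly one of expressiveness: the two waiter states are mutually exclusive, and an enum states that directly while letting the compiler warn about unhandled cases in a switch. A minimal, self-contained userspace sketch of the before/after shape (the names are illustrative only, not the kernel's):

#include <stdio.h>

/* Before: mutually exclusive states were encoded as bit flags, e.g.
 *     unsigned int flags;   with #define WAITING_FOR_READ 0x1 etc.
 * After: an enum makes the exclusivity explicit. */
enum waiter_type {
        WAITER_FOR_WRITE,
        WAITER_FOR_READ,
};

struct waiter {
        enum waiter_type type;
};

static const char *describe(const struct waiter *w)
{
        switch (w->type) {              /* compiler can flag a missing case */
        case WAITER_FOR_WRITE: return "writer";
        case WAITER_FOR_READ:  return "reader";
        }
        return "unknown";
}

int main(void)
{
        struct waiter w = { .type = WAITER_FOR_READ };
        printf("waiter is a %s\n", describe(&w));
        return 0;
}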
Signed-off-by: Michel Lespinasse Reviewed-by: Rik van Riel Reviewed-by: Peter Hurley Acked-by: Davidlohr Bueso Signed-off-by: Linus Torvalds --- lib/rwsem-spinlock.c | 19 +++++++++++-------- lib/rwsem.c | 23 +++++++++++++---------- 2 files changed, 24 insertions(+), 18 deletions(-) (limited to 'lib') diff --git a/lib/rwsem-spinlock.c b/lib/rwsem-spinlock.c index 7542afbb22b3..5f117f37ac0a 100644 --- a/lib/rwsem-spinlock.c +++ b/lib/rwsem-spinlock.c @@ -9,12 +9,15 @@ #include #include +enum rwsem_waiter_type { + RWSEM_WAITING_FOR_WRITE, + RWSEM_WAITING_FOR_READ +}; + struct rwsem_waiter { struct list_head list; struct task_struct *task; - unsigned int flags; -#define RWSEM_WAITING_FOR_READ 0x00000001 -#define RWSEM_WAITING_FOR_WRITE 0x00000002 + enum rwsem_waiter_type type; }; int rwsem_is_locked(struct rw_semaphore *sem) @@ -68,7 +71,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wakewrite) waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); if (!wakewrite) { - if (waiter->flags & RWSEM_WAITING_FOR_WRITE) + if (waiter->type == RWSEM_WAITING_FOR_WRITE) goto out; goto dont_wake_writers; } @@ -78,7 +81,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wakewrite) * to -1 here to indicate we get the lock. Instead, we wake it up * to let it go get it again. */ - if (waiter->flags & RWSEM_WAITING_FOR_WRITE) { + if (waiter->type == RWSEM_WAITING_FOR_WRITE) { wake_up_process(waiter->task); goto out; } @@ -86,7 +89,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wakewrite) /* grant an infinite number of read locks to the front of the queue */ dont_wake_writers: woken = 0; - while (waiter->flags & RWSEM_WAITING_FOR_READ) { + while (waiter->type == RWSEM_WAITING_FOR_READ) { struct list_head *next = waiter->list.next; list_del(&waiter->list); @@ -144,7 +147,7 @@ void __sched __down_read(struct rw_semaphore *sem) /* set up my own style of waitqueue */ waiter.task = tsk; - waiter.flags = RWSEM_WAITING_FOR_READ; + waiter.type = RWSEM_WAITING_FOR_READ; get_task_struct(tsk); list_add_tail(&waiter.list, &sem->wait_list); @@ -201,7 +204,7 @@ void __sched __down_write_nested(struct rw_semaphore *sem, int subclass) /* set up my own style of waitqueue */ tsk = current; waiter.task = tsk; - waiter.flags = RWSEM_WAITING_FOR_WRITE; + waiter.type = RWSEM_WAITING_FOR_WRITE; list_add_tail(&waiter.list, &sem->wait_list); /* wait for someone to release the lock */ diff --git a/lib/rwsem.c b/lib/rwsem.c index ad5e0df16ab4..672eb33218ac 100644 --- a/lib/rwsem.c +++ b/lib/rwsem.c @@ -30,12 +30,15 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name, EXPORT_SYMBOL(__init_rwsem); +enum rwsem_waiter_type { + RWSEM_WAITING_FOR_WRITE, + RWSEM_WAITING_FOR_READ +}; + struct rwsem_waiter { struct list_head list; struct task_struct *task; - unsigned int flags; -#define RWSEM_WAITING_FOR_READ 0x00000001 -#define RWSEM_WAITING_FOR_WRITE 0x00000002 + enum rwsem_waiter_type type; }; /* Wake types for __rwsem_do_wake(). 
Note that RWSEM_WAKE_NO_ACTIVE and @@ -65,7 +68,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wake_type) signed long woken, loop, adjustment; waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); - if (!(waiter->flags & RWSEM_WAITING_FOR_WRITE)) + if (waiter->type != RWSEM_WAITING_FOR_WRITE) goto readers_only; if (wake_type == RWSEM_WAKE_READ_OWNED) @@ -112,10 +115,10 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wake_type) waiter = list_entry(waiter->list.next, struct rwsem_waiter, list); - } while (waiter->flags & RWSEM_WAITING_FOR_READ); + } while (waiter->type != RWSEM_WAITING_FOR_WRITE); adjustment = woken * RWSEM_ACTIVE_READ_BIAS; - if (waiter->flags & RWSEM_WAITING_FOR_READ) + if (waiter->type != RWSEM_WAITING_FOR_WRITE) /* hit end of list above */ adjustment -= RWSEM_WAITING_BIAS; @@ -148,7 +151,7 @@ static int try_get_writer_sem(struct rw_semaphore *sem, /* only steal when first waiter is writing */ fwaiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); - if (!(fwaiter->flags & RWSEM_WAITING_FOR_WRITE)) + if (fwaiter->type != RWSEM_WAITING_FOR_WRITE) return 0; adjustment = RWSEM_ACTIVE_WRITE_BIAS; @@ -179,7 +182,7 @@ try_again_write: */ static struct rw_semaphore __sched * rwsem_down_failed_common(struct rw_semaphore *sem, - unsigned int flags, signed long adjustment) + enum rwsem_waiter_type type, signed long adjustment) { struct rwsem_waiter waiter; struct task_struct *tsk = current; @@ -190,7 +193,7 @@ rwsem_down_failed_common(struct rw_semaphore *sem, /* set up my own style of waitqueue */ raw_spin_lock_irq(&sem->wait_lock); waiter.task = tsk; - waiter.flags = flags; + waiter.type = type; get_task_struct(tsk); if (list_empty(&sem->wait_list)) @@ -221,7 +224,7 @@ rwsem_down_failed_common(struct rw_semaphore *sem, raw_spin_lock_irq(&sem->wait_lock); /* Try to get the writer sem, may steal from the head writer: */ - if (flags == RWSEM_WAITING_FOR_WRITE) + if (type == RWSEM_WAITING_FOR_WRITE) if (try_get_writer_sem(sem, &waiter)) { raw_spin_unlock_irq(&sem->wait_lock); return sem; -- cgit v1.2.3 From f7dd1cee9a4e2b1450e4a3732636dfbf28562ee4 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Tue, 7 May 2013 06:45:50 -0700 Subject: rwsem: shorter spinlocked section in rwsem_down_failed_common() This change reduces the size of the spinlocked and TASK_UNINTERRUPTIBLE sections in rwsem_down_failed_common(): - We only need the sem->wait_lock to insert ourselves on the wait_list; the waiter node can be prepared outside of the wait_lock. - The task state only needs to be set to TASK_UNINTERRUPTIBLE immediately before checking if we actually need to sleep; it doesn't need to protect the entire function. 
Signed-off-by: Michel Lespinasse Reviewed-by: Rik van Riel Reviewed-by: Peter Hurley Acked-by: Davidlohr Bueso Signed-off-by: Linus Torvalds --- lib/rwsem.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'lib') diff --git a/lib/rwsem.c b/lib/rwsem.c index 672eb33218ac..40636454cf3c 100644 --- a/lib/rwsem.c +++ b/lib/rwsem.c @@ -188,14 +188,12 @@ rwsem_down_failed_common(struct rw_semaphore *sem, struct task_struct *tsk = current; signed long count; - set_task_state(tsk, TASK_UNINTERRUPTIBLE); - /* set up my own style of waitqueue */ - raw_spin_lock_irq(&sem->wait_lock); waiter.task = tsk; waiter.type = type; get_task_struct(tsk); + raw_spin_lock_irq(&sem->wait_lock); if (list_empty(&sem->wait_list)) adjustment += RWSEM_WAITING_BIAS; list_add_tail(&waiter.list, &sem->wait_list); @@ -218,7 +216,8 @@ rwsem_down_failed_common(struct rw_semaphore *sem, raw_spin_unlock_irq(&sem->wait_lock); /* wait to be given the lock */ - for (;;) { + while (true) { + set_task_state(tsk, TASK_UNINTERRUPTIBLE); if (!waiter.task) break; @@ -231,7 +230,6 @@ rwsem_down_failed_common(struct rw_semaphore *sem, } raw_spin_unlock_irq(&sem->wait_lock); schedule(); - set_task_state(tsk, TASK_UNINTERRUPTIBLE); } tsk->state = TASK_RUNNING; -- cgit v1.2.3 From 1e78277ccbbb48af32a618d1ef0e8534e0b648d7 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Tue, 7 May 2013 06:45:51 -0700 Subject: rwsem: move rwsem_down_failed_common code into rwsem_down_{read,write}_failed Remove the rwsem_down_failed_common function and replace it with two identical copies of its code in rwsem_down_{read,write}_failed. This is because we want to make different optimizations in rwsem_down_{read,write}_failed; we are adding this pure-duplication step as a separate commit in order to make it easier to check the following steps. 
Signed-off-by: Michel Lespinasse Reviewed-by: Rik van Riel Reviewed-by: Peter Hurley Acked-by: Davidlohr Bueso Signed-off-by: Linus Torvalds --- lib/rwsem.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 57 insertions(+), 15 deletions(-) (limited to 'lib') diff --git a/lib/rwsem.c b/lib/rwsem.c index 40636454cf3c..fb658af1c12c 100644 --- a/lib/rwsem.c +++ b/lib/rwsem.c @@ -178,12 +178,12 @@ try_again_write: } /* - * wait for a lock to be granted + * wait for the read lock to be granted */ -static struct rw_semaphore __sched * -rwsem_down_failed_common(struct rw_semaphore *sem, - enum rwsem_waiter_type type, signed long adjustment) +struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) { + enum rwsem_waiter_type type = RWSEM_WAITING_FOR_READ; + signed long adjustment = -RWSEM_ACTIVE_READ_BIAS; struct rwsem_waiter waiter; struct task_struct *tsk = current; signed long count; @@ -237,22 +237,64 @@ rwsem_down_failed_common(struct rw_semaphore *sem, return sem; } -/* - * wait for the read lock to be granted - */ -struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) -{ - return rwsem_down_failed_common(sem, RWSEM_WAITING_FOR_READ, - -RWSEM_ACTIVE_READ_BIAS); -} - /* * wait for the write lock to be granted */ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) { - return rwsem_down_failed_common(sem, RWSEM_WAITING_FOR_WRITE, - -RWSEM_ACTIVE_WRITE_BIAS); + enum rwsem_waiter_type type = RWSEM_WAITING_FOR_WRITE; + signed long adjustment = -RWSEM_ACTIVE_WRITE_BIAS; + struct rwsem_waiter waiter; + struct task_struct *tsk = current; + signed long count; + + /* set up my own style of waitqueue */ + waiter.task = tsk; + waiter.type = type; + get_task_struct(tsk); + + raw_spin_lock_irq(&sem->wait_lock); + if (list_empty(&sem->wait_list)) + adjustment += RWSEM_WAITING_BIAS; + list_add_tail(&waiter.list, &sem->wait_list); + + /* we're now waiting on the lock, but no longer actively locking */ + count = rwsem_atomic_update(adjustment, sem); + + /* If there are no active locks, wake the front queued process(es) up. + * + * Alternatively, if we're called from a failed down_write(), there + * were already threads queued before us and there are no active + * writers, the lock must be read owned; so we try to wake any read + * locks that were queued ahead of us. 
*/ + if (count == RWSEM_WAITING_BIAS) + sem = __rwsem_do_wake(sem, RWSEM_WAKE_NO_ACTIVE); + else if (count > RWSEM_WAITING_BIAS && + adjustment == -RWSEM_ACTIVE_WRITE_BIAS) + sem = __rwsem_do_wake(sem, RWSEM_WAKE_READ_OWNED); + + raw_spin_unlock_irq(&sem->wait_lock); + + /* wait to be given the lock */ + while (true) { + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + if (!waiter.task) + break; + + raw_spin_lock_irq(&sem->wait_lock); + /* Try to get the writer sem, may steal from the head writer: */ + if (type == RWSEM_WAITING_FOR_WRITE) + if (try_get_writer_sem(sem, &waiter)) { + raw_spin_unlock_irq(&sem->wait_lock); + return sem; + } + raw_spin_unlock_irq(&sem->wait_lock); + schedule(); + } + + tsk->state = TASK_RUNNING; + + return sem; } /* -- cgit v1.2.3 From da16922cc031b9c0221c836994276ab193b31de8 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Tue, 7 May 2013 06:45:52 -0700 Subject: rwsem: simplify rwsem_down_read_failed When trying to acquire a read lock, the RWSEM_ACTIVE_READ_BIAS adjustment doesn't cause other readers to block, so we never have to worry about waking them back after canceling this adjustment in rwsem_down_read_failed(). We also never want to steal the lock in rwsem_down_read_failed(), so we don't have to grab the wait_lock either. Signed-off-by: Michel Lespinasse Reviewed-by: Rik van Riel Reviewed-by: Peter Hurley Acked-by: Davidlohr Bueso Signed-off-by: Linus Torvalds --- lib/rwsem.c | 22 ++-------------------- 1 file changed, 2 insertions(+), 20 deletions(-) (limited to 'lib') diff --git a/lib/rwsem.c b/lib/rwsem.c index fb658af1c12c..66f307e90761 100644 --- a/lib/rwsem.c +++ b/lib/rwsem.c @@ -182,7 +182,6 @@ try_again_write: */ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) { - enum rwsem_waiter_type type = RWSEM_WAITING_FOR_READ; signed long adjustment = -RWSEM_ACTIVE_READ_BIAS; struct rwsem_waiter waiter; struct task_struct *tsk = current; @@ -190,7 +189,7 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) /* set up my own style of waitqueue */ waiter.task = tsk; - waiter.type = type; + waiter.type = RWSEM_WAITING_FOR_READ; get_task_struct(tsk); raw_spin_lock_irq(&sem->wait_lock); @@ -201,17 +200,9 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) /* we're now waiting on the lock, but no longer actively locking */ count = rwsem_atomic_update(adjustment, sem); - /* If there are no active locks, wake the front queued process(es) up. - * - * Alternatively, if we're called from a failed down_write(), there - * were already threads queued before us and there are no active - * writers, the lock must be read owned; so we try to wake any read - * locks that were queued ahead of us. */ + /* If there are no active locks, wake the front queued process(es). 
*/ + if (count == RWSEM_WAITING_BIAS) sem = __rwsem_do_wake(sem, RWSEM_WAKE_NO_ACTIVE); raw_spin_unlock_irq(&sem->wait_lock); @@ -220,15 +211,6 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) set_task_state(tsk, TASK_UNINTERRUPTIBLE); if (!waiter.task) break; - - raw_spin_lock_irq(&sem->wait_lock); - /* Try to get the writer sem, may steal from the head writer: */ - if (type == RWSEM_WAITING_FOR_WRITE) - if (try_get_writer_sem(sem, &waiter)) { - raw_spin_unlock_irq(&sem->wait_lock); - return sem; - } - raw_spin_unlock_irq(&sem->wait_lock); schedule(); } -- cgit v1.2.3 From 023fe4f712028d25b42d31984abae1f3d3f0e3e2 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Tue, 7 May 2013 06:45:53 -0700 Subject: rwsem: simplify rwsem_down_write_failed When waking writers, we never grant them the lock - instead, they have to acquire it themselves when they run, and remove themselves from the wait_list when they succeed. As a result, we can do a few simplifications in rwsem_down_write_failed(): - We don't need to check for !waiter.task since __rwsem_do_wake() doesn't remove writers from the wait_list - There is no point releasing the wait_lock before entering the wait loop, as we will need to reacquire it immediately. We can change the loop so that the lock is always held at the start of each loop iteration. - We don't need to get a reference on the task structure, since the task is responsible for removing itself from the wait_list. There is no risk, like in the rwsem_down_read_failed() case, that a task would wake up and exit (thus destroying its task structure) while __rwsem_do_wake() is still running - wait_lock protects against that.
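A hedged userspace sketch of the resulting loop shape, with a pthread mutex standing in for wait_lock (try_acquire() and wait_for_wakeup() are made-up stand-ins, not the rwsem primitives): the lock is held at the top of every iteration, dropped only around the blocking call, and the waiter dequeues itself while still holding it, so no other thread ever needs a reference to it.

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t wait_lock = PTHREAD_MUTEX_INITIALIZER;

static void writer_slow_path(bool (*try_acquire)(void),
                             void (*wait_for_wakeup)(void))
{
        pthread_mutex_lock(&wait_lock);
        while (!try_acquire()) {
                pthread_mutex_unlock(&wait_lock);
                wait_for_wakeup();      /* stands in for schedule() */
                pthread_mutex_lock(&wait_lock);
        }
        /* still under wait_lock: remove our stack-allocated waiter here */
        pthread_mutex_unlock(&wait_lock);
}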
Signed-off-by: Michel Lespinasse Reviewed-by: Rik van Riel Reviewed-by: Peter Hurley Acked-by: Davidlohr Bueso Signed-off-by: Linus Torvalds --- lib/rwsem.c | 33 +++++++++------------------------ 1 file changed, 9 insertions(+), 24 deletions(-) (limited to 'lib') diff --git a/lib/rwsem.c b/lib/rwsem.c index 66f307e90761..c73bd96dc30c 100644 --- a/lib/rwsem.c +++ b/lib/rwsem.c @@ -161,16 +161,8 @@ static int try_get_writer_sem(struct rw_semaphore *sem, try_again_write: oldcount = rwsem_atomic_update(adjustment, sem) - adjustment; - if (!(oldcount & RWSEM_ACTIVE_MASK)) { - /* No active lock: */ - struct task_struct *tsk = waiter->task; - - list_del(&waiter->list); - smp_mb(); - put_task_struct(tsk); - tsk->state = TASK_RUNNING; + if (!(oldcount & RWSEM_ACTIVE_MASK)) return 1; - } /* some one grabbed the sem already */ if (rwsem_atomic_update(-adjustment, sem) & RWSEM_ACTIVE_MASK) return 0; @@ -220,11 +212,10 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) } /* - * wait for the write lock to be granted + * wait until we successfully acquire the write lock */ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) { - enum rwsem_waiter_type type = RWSEM_WAITING_FOR_WRITE; signed long adjustment = -RWSEM_ACTIVE_WRITE_BIAS; struct rwsem_waiter waiter; struct task_struct *tsk = current; @@ -232,8 +223,7 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) /* set up my own style of waitqueue */ waiter.task = tsk; - waiter.type = type; - get_task_struct(tsk); + waiter.type = RWSEM_WAITING_FOR_WRITE; raw_spin_lock_irq(&sem->wait_lock); if (list_empty(&sem->wait_list)) @@ -255,25 +245,20 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) adjustment == -RWSEM_ACTIVE_WRITE_BIAS) sem = __rwsem_do_wake(sem, RWSEM_WAKE_READ_OWNED); - raw_spin_unlock_irq(&sem->wait_lock); - - /* wait to be given the lock */ + /* wait until we successfully acquire the lock */ while (true) { set_task_state(tsk, TASK_UNINTERRUPTIBLE); - if (!waiter.task) + + if (try_get_writer_sem(sem, &waiter)) break; - raw_spin_lock_irq(&sem->wait_lock); - /* Try to get the writer sem, may steal from the head writer: */ - if (type == RWSEM_WAITING_FOR_WRITE) - if (try_get_writer_sem(sem, &waiter)) { - raw_spin_unlock_irq(&sem->wait_lock); - return sem; - } raw_spin_unlock_irq(&sem->wait_lock); schedule(); + raw_spin_lock_irq(&sem->wait_lock); } + list_del(&waiter.list); + raw_spin_unlock_irq(&sem->wait_lock); tsk->state = TASK_RUNNING; return sem; -- cgit v1.2.3 From ed00f64346631dff035adfb9b0240daaa8b46c4e Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Tue, 7 May 2013 06:45:54 -0700 Subject: rwsem: more agressive lock stealing in rwsem_down_write_failed Some small code simplifications can be achieved by doing more agressive lock stealing: - When rwsem_down_write_failed() notices that there are no active locks (and thus no thread to wake us if we decided to sleep), it used to wake the first queued process. However, stealing the lock is also sufficient to deal with this case, so we don't need this check anymore. - In try_get_writer_sem(), we can steal the lock even when the first waiter is a reader. This is correct because the code path that wakes readers is protected by the wait_lock. 
As to the performance effects of this change, they are expected to be minimal: readers are still granted the lock (rather than having to acquire it themselves) when they reach the front of the wait queue, so we have essentially the same behavior as in rwsem-spinlock. Signed-off-by: Michel Lespinasse Reviewed-by: Rik van Riel Reviewed-by: Peter Hurley Acked-by: Davidlohr Bueso Signed-off-by: Linus Torvalds --- lib/rwsem.c | 29 ++++++++--------------------- 1 file changed, 8 insertions(+), 21 deletions(-) (limited to 'lib') diff --git a/lib/rwsem.c b/lib/rwsem.c index c73bd96dc30c..2360bf204098 100644 --- a/lib/rwsem.c +++ b/lib/rwsem.c @@ -143,20 +143,12 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wake_type) } /* Try to get write sem, caller holds sem->wait_lock: */ -static int try_get_writer_sem(struct rw_semaphore *sem, - struct rwsem_waiter *waiter) +static int try_get_writer_sem(struct rw_semaphore *sem) { - struct rwsem_waiter *fwaiter; long oldcount, adjustment; - /* only steal when first waiter is writing */ - fwaiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); - if (fwaiter->type != RWSEM_WAITING_FOR_WRITE) - return 0; - adjustment = RWSEM_ACTIVE_WRITE_BIAS; - /* Only one waiter in the queue: */ - if (fwaiter == waiter && waiter->list.next == &sem->wait_list) + if (list_is_singular(&sem->wait_list)) adjustment -= RWSEM_WAITING_BIAS; try_again_write: @@ -233,23 +225,18 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) /* we're now waiting on the lock, but no longer actively locking */ count = rwsem_atomic_update(adjustment, sem); - /* If there are no active locks, wake the front queued process(es) up. - * - * Alternatively, if we're called from a failed down_write(), there - * were already threads queued before us and there are no active - * writers, the lock must be read owned; so we try to wake any read - * locks that were queued ahead of us. */ - if (count == RWSEM_WAITING_BIAS) - sem = __rwsem_do_wake(sem, RWSEM_WAKE_NO_ACTIVE); - else if (count > RWSEM_WAITING_BIAS && - adjustment == -RWSEM_ACTIVE_WRITE_BIAS) + /* If there were already threads queued before us and there are no + * active writers, the lock must be read owned; so we try to wake + * any read locks that were queued ahead of us. */ + if (count > RWSEM_WAITING_BIAS && + adjustment == -RWSEM_ACTIVE_WRITE_BIAS) sem = __rwsem_do_wake(sem, RWSEM_WAKE_READ_OWNED); /* wait until we successfully acquire the lock */ while (true) { set_task_state(tsk, TASK_UNINTERRUPTIBLE); - if (try_get_writer_sem(sem, &waiter)) + if (try_get_writer_sem(sem)) break; raw_spin_unlock_irq(&sem->wait_lock); -- cgit v1.2.3 From 5ede972df1cd9294c82e9515949fd2103be81d7b Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Tue, 7 May 2013 06:45:55 -0700 Subject: rwsem: use cmpxchg for trying to steal write lock Using rwsem_atomic_update to try stealing the write lock forced us to undo the adjustment in the failure path. We can have simpler and faster code by using cmpxchg instead. 
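The difference between the two approaches, sketched with C11 atomics in userspace (a single counter where 0 means "free" is a simplified stand-in for the real rwsem count encoding, and the real code retries rather than returning false):

#include <stdatomic.h>
#include <stdbool.h>

static _Atomic long count;                      /* 0 means "free" here */

/* Old approach: speculatively add the writer bias, then undo the
 * adjustment if the lock turned out to be held; the failure path costs
 * two atomic read-modify-write operations. */
static bool steal_with_add(long write_bias)
{
        if (atomic_fetch_add(&count, write_bias) == 0)
                return true;
        atomic_fetch_sub(&count, write_bias);   /* roll the bias back */
        return false;
}

/* New approach: one compare-and-swap that only succeeds while the count
 * still reads "free"; nothing has to be undone on failure. */
static bool steal_with_cmpxchg(long write_bias)
{
        long expected = 0;

        return atomic_compare_exchange_strong(&count, &expected, write_bias);
}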
Signed-off-by: Michel Lespinasse Reviewed-by: Peter Hurley Acked-by: Davidlohr Bueso Acked-by: Rik van Riel Signed-off-by: Linus Torvalds --- lib/rwsem.c | 26 ++++++-------------------- 1 file changed, 6 insertions(+), 20 deletions(-) (limited to 'lib') diff --git a/lib/rwsem.c b/lib/rwsem.c index 2360bf204098..64c2dc007be2 100644 --- a/lib/rwsem.c +++ b/lib/rwsem.c @@ -142,25 +142,6 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wake_type) return sem; } -/* Try to get write sem, caller holds sem->wait_lock: */ -static int try_get_writer_sem(struct rw_semaphore *sem) -{ - long oldcount, adjustment; - - adjustment = RWSEM_ACTIVE_WRITE_BIAS; - if (list_is_singular(&sem->wait_list)) - adjustment -= RWSEM_WAITING_BIAS; - -try_again_write: - oldcount = rwsem_atomic_update(adjustment, sem) - adjustment; - if (!(oldcount & RWSEM_ACTIVE_MASK)) - return 1; - /* some one grabbed the sem already */ - if (rwsem_atomic_update(-adjustment, sem) & RWSEM_ACTIVE_MASK) - return 0; - goto try_again_write; -} - /* * wait for the read lock to be granted */ @@ -236,7 +217,12 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) while (true) { set_task_state(tsk, TASK_UNINTERRUPTIBLE); - if (try_get_writer_sem(sem)) + /* Try acquiring the write lock. */ + count = RWSEM_ACTIVE_WRITE_BIAS; + if (!list_is_singular(&sem->wait_list)) + count += RWSEM_WAITING_BIAS; + if (cmpxchg(&sem->count, RWSEM_WAITING_BIAS, count) == + RWSEM_WAITING_BIAS) break; raw_spin_unlock_irq(&sem->wait_lock); -- cgit v1.2.3 From a7d2c573ae7fad1b2c877d1a1342fa5bb0d6478c Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Tue, 7 May 2013 06:45:56 -0700 Subject: rwsem: avoid taking wait_lock in rwsem_down_write_failed In rwsem_down_write_failed(), if there are active locks after we wake up (i.e. the lock got stolen from us), skip taking the wait_lock and go back to sleep immediately. Signed-off-by: Michel Lespinasse Reviewed-by: Peter Hurley Acked-by: Davidlohr Bueso Acked-by: Rik van Riel Signed-off-by: Linus Torvalds --- lib/rwsem.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/rwsem.c b/lib/rwsem.c index 64c2dc007be2..edf3d9ca670e 100644 --- a/lib/rwsem.c +++ b/lib/rwsem.c @@ -214,8 +214,8 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) sem = __rwsem_do_wake(sem, RWSEM_WAKE_READ_OWNED); /* wait until we successfully acquire the lock */ + set_task_state(tsk, TASK_UNINTERRUPTIBLE); while (true) { - set_task_state(tsk, TASK_UNINTERRUPTIBLE); /* Try acquiring the write lock. */ count = RWSEM_ACTIVE_WRITE_BIAS; @@ -226,7 +226,13 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) break; raw_spin_unlock_irq(&sem->wait_lock); - schedule(); + + /* Block until there are no active lockers. */ + do { + schedule(); + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + } while (sem->count & RWSEM_ACTIVE_MASK); + raw_spin_lock_irq(&sem->wait_lock); } -- cgit v1.2.3 From 9b0fc9c09f1b262b7fe697eba6b05095d78850e5 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Tue, 7 May 2013 06:45:57 -0700 Subject: rwsem: skip initial trylock in rwsem_down_write_failed We can skip the initial trylock in rwsem_down_write_failed() if there are known active lockers already, thus saving one likely-to-fail cmpxchg. 
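The underlying pattern, in a minimal C11 userspace form (again a toy counter, not the kernel code): only attempt the compare-and-swap when a cheap read says it can possibly succeed, since even a failing cmpxchg has to take the cache line exclusively.

#include <stdatomic.h>
#include <stdbool.h>

static _Atomic long count;                      /* 0 means "free" here */

static bool try_steal(long write_bias)
{
        long expected = 0;

        /* Plain relaxed load as a filter; the cmpxchg that follows
         * provides the actual synchronization when it succeeds. */
        if (atomic_load_explicit(&count, memory_order_relaxed) != 0)
                return false;                   /* known busy: skip it */
        return atomic_compare_exchange_strong(&count, &expected, write_bias);
}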
Signed-off-by: Michel Lespinasse Reviewed-by: Peter Hurley Acked-by: Davidlohr Bueso Acked-by: Rik van Riel Signed-off-by: Linus Torvalds --- lib/rwsem.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'lib') diff --git a/lib/rwsem.c b/lib/rwsem.c index edf3d9ca670e..0d50e46d5b0c 100644 --- a/lib/rwsem.c +++ b/lib/rwsem.c @@ -216,14 +216,15 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) /* wait until we successfully acquire the lock */ set_task_state(tsk, TASK_UNINTERRUPTIBLE); while (true) { - - /* Try acquiring the write lock. */ - count = RWSEM_ACTIVE_WRITE_BIAS; - if (!list_is_singular(&sem->wait_list)) - count += RWSEM_WAITING_BIAS; - if (cmpxchg(&sem->count, RWSEM_WAITING_BIAS, count) == + if (!(count & RWSEM_ACTIVE_MASK)) { + /* Try acquiring the write lock. */ + count = RWSEM_ACTIVE_WRITE_BIAS; + if (!list_is_singular(&sem->wait_list)) + count += RWSEM_WAITING_BIAS; + if (cmpxchg(&sem->count, RWSEM_WAITING_BIAS, count) == RWSEM_WAITING_BIAS) - break; + break; + } raw_spin_unlock_irq(&sem->wait_lock); @@ -231,7 +232,7 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) do { schedule(); set_task_state(tsk, TASK_UNINTERRUPTIBLE); - } while (sem->count & RWSEM_ACTIVE_MASK); + } while ((count = sem->count) & RWSEM_ACTIVE_MASK); raw_spin_lock_irq(&sem->wait_lock); } -- cgit v1.2.3 From 8cf5322ce69afea1fab6a6270db24d057d664798 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Tue, 7 May 2013 06:45:58 -0700 Subject: rwsem: simplify __rwsem_do_wake This is mostly for cleanup value: - We don't need several gotos to handle the case where the first waiter is a writer. Two simple tests will do (and generate very similar code). - In the remainder of the function, we know the first waiter is a reader, so we don't have to double check that. We can use do..while loops to iterate over the readers to wake (generates slightly better code). Signed-off-by: Michel Lespinasse Reviewed-by: Peter Hurley Acked-by: Davidlohr Bueso Signed-off-by: Linus Torvalds --- lib/rwsem-spinlock.c | 23 +++++++---------------- lib/rwsem.c | 26 ++++++++++++-------------- 2 files changed, 19 insertions(+), 30 deletions(-) (limited to 'lib') diff --git a/lib/rwsem-spinlock.c b/lib/rwsem-spinlock.c index 5f117f37ac0a..9be8a9144978 100644 --- a/lib/rwsem-spinlock.c +++ b/lib/rwsem-spinlock.c @@ -70,26 +70,17 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wakewrite) waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); - if (!wakewrite) { - if (waiter->type == RWSEM_WAITING_FOR_WRITE) - goto out; - goto dont_wake_writers; - } - - /* - * as we support write lock stealing, we can't set sem->activity - * to -1 here to indicate we get the lock. Instead, we wake it up - * to let it go get it again. - */ if (waiter->type == RWSEM_WAITING_FOR_WRITE) { - wake_up_process(waiter->task); + if (wakewrite) + /* Wake up a writer. Note that we do not grant it the + * lock - it will have to acquire it when it runs. 
*/ + wake_up_process(waiter->task); goto out; } /* grant an infinite number of read locks to the front of the queue */ - dont_wake_writers: woken = 0; - while (waiter->type == RWSEM_WAITING_FOR_READ) { + do { struct list_head *next = waiter->list.next; list_del(&waiter->list); @@ -99,10 +90,10 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wakewrite) wake_up_process(tsk); put_task_struct(tsk); woken++; - if (list_empty(&sem->wait_list)) + if (next == &sem->wait_list) break; waiter = list_entry(next, struct rwsem_waiter, list); - } + } while (waiter->type != RWSEM_WAITING_FOR_WRITE); sem->activity += woken; diff --git a/lib/rwsem.c b/lib/rwsem.c index 0d50e46d5b0c..9a675fa9d78e 100644 --- a/lib/rwsem.c +++ b/lib/rwsem.c @@ -68,20 +68,17 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wake_type) signed long woken, loop, adjustment; waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); - if (waiter->type != RWSEM_WAITING_FOR_WRITE) - goto readers_only; - - if (wake_type == RWSEM_WAKE_READ_OWNED) - /* Another active reader was observed, so wakeup is not - * likely to succeed. Save the atomic op. - */ + if (waiter->type == RWSEM_WAITING_FOR_WRITE) { + if (wake_type != RWSEM_WAKE_READ_OWNED) + /* Wake writer at the front of the queue, but do not + * grant it the lock yet as we want other writers + * to be able to steal it. Readers, on the other hand, + * will block as they will notice the queued writer. + */ + wake_up_process(waiter->task); goto out; + } - /* Wake up the writing waiter and let the task grab the sem: */ - wake_up_process(waiter->task); - goto out; - - readers_only: /* If we come here from up_xxxx(), another thread might have reached * rwsem_down_failed_common() before we acquired the spinlock and * woken up a waiter, making it now active. We prefer to check for @@ -125,7 +122,8 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wake_type) rwsem_atomic_add(adjustment, sem); next = sem->wait_list.next; - for (loop = woken; loop > 0; loop--) { + loop = woken; + do { waiter = list_entry(next, struct rwsem_waiter, list); next = waiter->list.next; tsk = waiter->task; @@ -133,7 +131,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wake_type) waiter->task = NULL; wake_up_process(tsk); put_task_struct(tsk); - } + } while (--loop); sem->wait_list.next = next; next->prev = &sem->wait_list; -- cgit v1.2.3 From fe6e674c6187d4f452a679ced7e95262bd517936 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Tue, 7 May 2013 06:45:59 -0700 Subject: rwsem: implement support for write lock stealing on the fastpath When we decide to wake up readers, we must first grant them as many read locks as necessary, and then actually wake up all these readers. But in order to know how many read shares to grant, we must first count the readers at the head of the queue. This might take a while if there are many readers, and we want to be protected against a writer stealing the lock while we're counting. To that end, we grant the first reader lock before counting how many more readers are queued. We also require some adjustments to the wake_type semantics. RWSEM_WAKE_NO_ACTIVE used to mean that we had found the count to be RWSEM_WAITING_BIAS, in which case the rwsem was known to be free as nobody could steal it while we hold the wait_lock. This doesn't make sense once we implement fastpath write lock stealing, so we now use RWSEM_WAKE_ANY in that case. 
Similarly, when rwsem_down_write_failed found that a read lock was active, it would use RWSEM_WAKE_READ_OWNED which signalled that new readers could be woken without checking first that the rwsem was available. We can't do that anymore since the existing readers might release their read locks, and a writer could steal the lock before we wake up additional readers. So, we have to use a new RWSEM_WAKE_READERS value to indicate we only want to wake readers, but we don't currently hold any read lock. Signed-off-by: Michel Lespinasse Reviewed-by: Peter Hurley Acked-by: Davidlohr Bueso Signed-off-by: Linus Torvalds --- lib/rwsem.c | 64 ++++++++++++++++++++++++++++++------------------------------- 1 file changed, 32 insertions(+), 32 deletions(-) (limited to 'lib') diff --git a/lib/rwsem.c b/lib/rwsem.c index 9a675fa9d78e..bbe48c04f363 100644 --- a/lib/rwsem.c +++ b/lib/rwsem.c @@ -4,6 +4,7 @@ * Derived from arch/i386/kernel/semaphore.c * * Writer lock-stealing by Alex Shi + * and Michel Lespinasse */ #include #include @@ -41,13 +42,11 @@ struct rwsem_waiter { enum rwsem_waiter_type type; }; -/* Wake types for __rwsem_do_wake(). Note that RWSEM_WAKE_NO_ACTIVE and - * RWSEM_WAKE_READ_OWNED imply that the spinlock must have been kept held - * since the rwsem value was observed. - */ -#define RWSEM_WAKE_ANY 0 /* Wake whatever's at head of wait list */ -#define RWSEM_WAKE_NO_ACTIVE 1 /* rwsem was observed with no active thread */ -#define RWSEM_WAKE_READ_OWNED 2 /* rwsem was observed to be read owned */ +enum rwsem_wake_type { + RWSEM_WAKE_ANY, /* Wake whatever's at head of wait list */ + RWSEM_WAKE_READERS, /* Wake readers only */ + RWSEM_WAKE_READ_OWNED /* Waker thread holds the read lock */ +}; /* * handle the lock release when processes blocked on it that can now run @@ -60,16 +59,16 @@ struct rwsem_waiter { * - writers are only woken if downgrading is false */ static struct rw_semaphore * -__rwsem_do_wake(struct rw_semaphore *sem, int wake_type) +__rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) { struct rwsem_waiter *waiter; struct task_struct *tsk; struct list_head *next; - signed long woken, loop, adjustment; + signed long oldcount, woken, loop, adjustment; waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); if (waiter->type == RWSEM_WAITING_FOR_WRITE) { - if (wake_type != RWSEM_WAKE_READ_OWNED) + if (wake_type == RWSEM_WAKE_ANY) /* Wake writer at the front of the queue, but do not * grant it the lock yet as we want other writers * to be able to steal it. Readers, on the other hand, @@ -79,24 +78,24 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wake_type) goto out; } - /* If we come here from up_xxxx(), another thread might have reached - * rwsem_down_failed_common() before we acquired the spinlock and - * woken up a waiter, making it now active. We prefer to check for - * this first in order to not spend too much time with the spinlock - * held if we're not going to be able to wake up readers in the end. - * - * Note that we do not need to update the rwsem count: any writer - * trying to acquire rwsem will run rwsem_down_write_failed() due - * to the waiting threads and block trying to acquire the spinlock. - * - * We use a dummy atomic update in order to acquire the cache line - * exclusively since we expect to succeed and run the final rwsem - * count adjustment pretty soon. + /* Writers might steal the lock before we grant it to the next reader. 
+ * We prefer to do the first reader grant before counting readers + * so we can bail out early if a writer stole the lock. */ - if (wake_type == RWSEM_WAKE_ANY && - rwsem_atomic_update(0, sem) < RWSEM_WAITING_BIAS) - /* Someone grabbed the sem for write already */ - goto out; + adjustment = 0; + if (wake_type != RWSEM_WAKE_READ_OWNED) { + adjustment = RWSEM_ACTIVE_READ_BIAS; + try_reader_grant: + oldcount = rwsem_atomic_update(adjustment, sem) - adjustment; + if (unlikely(oldcount < RWSEM_WAITING_BIAS)) { + /* A writer stole the lock. Undo our reader grant. */ + if (rwsem_atomic_update(-adjustment, sem) & + RWSEM_ACTIVE_MASK) + goto out; + /* Last active locker left. Retry waking readers. */ + goto try_reader_grant; + } + } /* Grant an infinite number of read locks to the readers at the front * of the queue. Note we increment the 'active part' of the count by @@ -114,12 +113,13 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wake_type) } while (waiter->type != RWSEM_WAITING_FOR_WRITE); - adjustment = woken * RWSEM_ACTIVE_READ_BIAS; + adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment; if (waiter->type != RWSEM_WAITING_FOR_WRITE) /* hit end of list above */ adjustment -= RWSEM_WAITING_BIAS; - rwsem_atomic_add(adjustment, sem); + if (adjustment) + rwsem_atomic_add(adjustment, sem); next = sem->wait_list.next; loop = woken; @@ -164,8 +164,8 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) count = rwsem_atomic_update(adjustment, sem); /* If there are no active locks, wake the front queued process(es). */ - if (count == RWSEM_WAITING_BIAS) - sem = __rwsem_do_wake(sem, RWSEM_WAKE_NO_ACTIVE); + if (!(count & RWSEM_ACTIVE_MASK)) + sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY); raw_spin_unlock_irq(&sem->wait_lock); @@ -209,7 +209,7 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) * any read locks that were queued ahead of us. */ if (count > RWSEM_WAITING_BIAS && adjustment == -RWSEM_ACTIVE_WRITE_BIAS) - sem = __rwsem_do_wake(sem, RWSEM_WAKE_READ_OWNED); + sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS); /* wait until we successfully acquire the lock */ set_task_state(tsk, TASK_UNINTERRUPTIBLE); -- cgit v1.2.3 From 25c39325968bbcebe6cd2a1991228c9dfb48d655 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Tue, 7 May 2013 06:46:00 -0700 Subject: rwsem: do not block readers at head of queue if other readers are active This change fixes a race condition where a reader might determine it needs to block, but by the time it acquires the wait_lock the rwsem has active readers and no queued waiters. In this situation the reader can run in parallel with the existing active readers; it does not need to block until the active readers complete. Thanks to Peter Hurley for noticing this possible race. Signed-off-by: Michel Lespinasse Reviewed-by: Peter Hurley Acked-by: Davidlohr Bueso Signed-off-by: Linus Torvalds --- lib/rwsem.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/rwsem.c b/lib/rwsem.c index bbe48c04f363..61f91ca75e40 100644 --- a/lib/rwsem.c +++ b/lib/rwsem.c @@ -163,8 +163,14 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) /* we're now waiting on the lock, but no longer actively locking */ count = rwsem_atomic_update(adjustment, sem); - /* If there are no active locks, wake the front queued process(es). */ - if (!(count & RWSEM_ACTIVE_MASK)) + /* If there are no active locks, wake the front queued process(es). 
+ * + * If there are no writers and we are first in the queue, + * wake our own waiter to join the existing active readers ! + */ + if (count == RWSEM_WAITING_BIAS || + (count > RWSEM_WAITING_BIAS && + adjustment != -RWSEM_ACTIVE_READ_BIAS)) sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY); raw_spin_unlock_irq(&sem->wait_lock); -- cgit v1.2.3 From b5f541810ea9fb98d93c0ee0e00e07a22874856f Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 7 May 2013 06:46:02 -0700 Subject: rwsem: no need for explicit signed longs Change explicit "signed long" declarations into plain "long" as suggested by Peter Hurley. Signed-off-by: Davidlohr Bueso Reviewed-by: Michel Lespinasse Signed-off-by: Michel Lespinasse Signed-off-by: Linus Torvalds --- lib/rwsem.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'lib') diff --git a/lib/rwsem.c b/lib/rwsem.c index 61f91ca75e40..cf0ad2ad19f5 100644 --- a/lib/rwsem.c +++ b/lib/rwsem.c @@ -64,7 +64,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) struct rwsem_waiter *waiter; struct task_struct *tsk; struct list_head *next; - signed long oldcount, woken, loop, adjustment; + long oldcount, woken, loop, adjustment; waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); if (waiter->type == RWSEM_WAITING_FOR_WRITE) { @@ -145,10 +145,9 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) */ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) { - signed long adjustment = -RWSEM_ACTIVE_READ_BIAS; + long count, adjustment = -RWSEM_ACTIVE_READ_BIAS; struct rwsem_waiter waiter; struct task_struct *tsk = current; - signed long count; /* set up my own style of waitqueue */ waiter.task = tsk; @@ -193,10 +192,9 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) */ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) { - signed long adjustment = -RWSEM_ACTIVE_WRITE_BIAS; + long count, adjustment = -RWSEM_ACTIVE_WRITE_BIAS; struct rwsem_waiter waiter; struct task_struct *tsk = current; - signed long count; /* set up my own style of waitqueue */ waiter.task = tsk; -- cgit v1.2.3 From 2d864e41710f1d2ba406fb62018ab0487152e6f2 Mon Sep 17 00:00:00 2001 From: Anatol Pomozov Date: Tue, 7 May 2013 15:37:48 -0700 Subject: kref: minor cleanup - make warning smp-safe - result of atomic _unless_zero functions should be checked by caller to avoid use-after-free error - trivial whitespace fix. 
Link: https://lkml.org/lkml/2013/4/12/391 Tested: compile x86, boot machine and run xfstests Signed-off-by: Anatol Pomozov [ Removed line-break, changed to use WARN_ON_ONCE() - Linus ] Signed-off-by: Linus Torvalds --- lib/kobject.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/kobject.c b/lib/kobject.c index a65486613d79..b7e29a6056d3 100644 --- a/lib/kobject.c +++ b/lib/kobject.c @@ -529,7 +529,7 @@ struct kobject *kobject_get(struct kobject *kobj) return kobj; } -static struct kobject *kobject_get_unless_zero(struct kobject *kobj) +static struct kobject * __must_check kobject_get_unless_zero(struct kobject *kobj) { if (!kref_get_unless_zero(&kobj->kref)) kobj = NULL; -- cgit v1.2.3 From 9607a85b67a9714290a78c1a56630ab1c9fa2c23 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 7 May 2013 15:39:03 -0700 Subject: rwsem: check counter to avoid cmpxchg calls This patch tries to reduce the amount of cmpxchg calls in the writer failed path by checking the counter value first before issuing the instruction. If ->count is not set to RWSEM_WAITING_BIAS then there is no point wasting a cmpxchg call. Furthermore, Michel states "I suppose it helps due to the case where someone else steals the lock while we're trying to acquire sem->wait_lock." Two very different workloads and machines were used to see how this patch improves throughput: pgbench on a quad-core laptop and aim7 on a large 8 socket box with 80 cores. Some results comparing Michel's fast-path write lock stealing (tps-rwsem) on a quad-core laptop running pgbench: | db_size | clients | tps-rwsem | tps-patch | +---------+----------+----------------+--------------+ | 160 MB | 1 | 6906 | 9153 | + 32.5 | 160 MB | 2 | 15931 | 22487 | + 41.1% | 160 MB | 4 | 33021 | 32503 | | 160 MB | 8 | 34626 | 34695 | | 160 MB | 16 | 33098 | 34003 | | 160 MB | 20 | 31343 | 31440 | | 160 MB | 30 | 28961 | 28987 | | 160 MB | 40 | 26902 | 26970 | | 160 MB | 50 | 25760 | 25810 | ------------------------------------------------------ | 1.6 GB | 1 | 7729 | 7537 | | 1.6 GB | 2 | 19009 | 23508 | + 23.7% | 1.6 GB | 4 | 33185 | 32666 | | 1.6 GB | 8 | 34550 | 34318 | | 1.6 GB | 16 | 33079 | 32689 | | 1.6 GB | 20 | 31494 | 31702 | | 1.6 GB | 30 | 28535 | 28755 | | 1.6 GB | 40 | 27054 | 27017 | | 1.6 GB | 50 | 25591 | 25560 | ------------------------------------------------------ | 7.6 GB | 1 | 6224 | 7469 | + 20.0% | 7.6 GB | 2 | 13611 | 12778 | | 7.6 GB | 4 | 33108 | 32927 | | 7.6 GB | 8 | 34712 | 34878 | | 7.6 GB | 16 | 32895 | 33003 | | 7.6 GB | 20 | 31689 | 31974 | | 7.6 GB | 30 | 29003 | 28806 | | 7.6 GB | 40 | 26683 | 26976 | | 7.6 GB | 50 | 25925 | 25652 | ------------------------------------------------------ For the aim7 worloads, they overall improved on top of Michel's patchset. 
For full graphs on how the rwsem series plus this patch behaves on a large 8 socket machine against a vanilla kernel: http://stgolabs.net/rwsem-aim7-results.tar.gz Signed-off-by: Davidlohr Bueso Signed-off-by: Linus Torvalds --- lib/rwsem.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/rwsem.c b/lib/rwsem.c index cf0ad2ad19f5..19c5fa95e0b4 100644 --- a/lib/rwsem.c +++ b/lib/rwsem.c @@ -223,7 +223,9 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) count = RWSEM_ACTIVE_WRITE_BIAS; if (!list_is_singular(&sem->wait_list)) count += RWSEM_WAITING_BIAS; - if (cmpxchg(&sem->count, RWSEM_WAITING_BIAS, count) == + + if (sem->count == RWSEM_WAITING_BIAS && + cmpxchg(&sem->count, RWSEM_WAITING_BIAS, count) == RWSEM_WAITING_BIAS) break; } -- cgit v1.2.3