From d3091298570006fa538ec9beacbfb1098964962e Mon Sep 17 00:00:00 2001
From: Sam Ravnborg <sam@ravnborg.org>
Date: Fri, 16 May 2014 23:26:05 +0200
Subject: sparc: fix sparse warnings in smp_32.c + smp_64.c

Fix following warnings:
smp_32.c:177:5: warning: symbol 'setup_profiling_timer' was not declared. Should it be static?
smp_64.c:1202:5: warning: symbol 'setup_profiling_timer' was not declared. Should it be static?
smp_64.c:989:6: warning: symbol 'kgdb_roundup_cpus' was not declared. Should it be static?

Add prototype to include/linux/profile.h of setup_profiling_timer
Add missing include to smp_64.c

Signed-off-by: Sam Ravnborg <sam@ravnborg.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/profile.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/profile.h b/include/linux/profile.h
index aaad3861beb8..b537a25ffa17 100644
--- a/include/linux/profile.h
+++ b/include/linux/profile.h
@@ -44,6 +44,7 @@ extern int prof_on __read_mostly;
 int profile_init(void);
 int profile_setup(char *str);
 void profile_tick(int type);
+int setup_profiling_timer(unsigned int multiplier);
 
 /*
  * Add multiple profiler hits to a given address:
-- 
cgit v1.2.3


From b14903e10a06347234b387f7364f86aa07252d9f Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 4 Jun 2014 16:46:00 +0200
Subject: regulator: add regulator_can_change_voltage stub

When CONFIG_REGULATOR is not set, we cannot call
regulator_can_change_voltage() from a device driver, which results
in a build error like

video/fbdev/omap2/dss/hdmi5.c: In function 'hdmi_init_regulator':
video/fbdev/omap2/dss/hdmi5.c:149:2: error: implicit declaration of function 'regulator_can_change_voltage' [-Werror=implicit-function-declaration]

even for drivers that don't require the regulator API normally.
Such a use was recently added in the omap2+ hdmi driver.

This avoids the problem by adding a static inline function
stub in the API header, as we have for most of the other
regulator functions as well.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Cc: Mark Brown <broonie@kernel.org>
Cc: Tomi Valkeinen <tomi.valkeinen@ti.com>
Signed-off-by: Mark Brown <broonie@linaro.org>
---
 include/linux/regulator/consumer.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/regulator/consumer.h b/include/linux/regulator/consumer.h
index e530681bea70..d60b92a7fc25 100644
--- a/include/linux/regulator/consumer.h
+++ b/include/linux/regulator/consumer.h
@@ -391,6 +391,11 @@ static inline void regulator_bulk_free(int num_consumers,
 {
 }
 
+static inline int regulator_can_change_voltage(struct regulator *regulator)
+{
+	return 0;
+}
+
 static inline int regulator_set_voltage(struct regulator *regulator,
 					int min_uV, int max_uV)
 {
-- 
cgit v1.2.3


From 962bd40bc30e412828e091bfda041b7547e779c8 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@poochiereds.net>
Date: Tue, 10 Jun 2014 12:24:40 -0400
Subject: locks: add missing memory barrier in break_deleg

break_deleg is subject to the same potential race as break_lease. Add
a memory barrier to prevent it.

Signed-off-by: Jeff Layton <jlayton@primarydata.com>
---
 include/linux/fs.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index c3f46e499dd0..22ae79650b82 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1914,6 +1914,12 @@ static inline int break_lease(struct inode *inode, unsigned int mode)
 
 static inline int break_deleg(struct inode *inode, unsigned int mode)
 {
+	/*
+	 * Since this check is lockless, we must ensure that any refcounts
+	 * taken are done before checking inode->i_flock. Otherwise, we could
+	 * end up racing with tasks trying to set a new lease on this file.
+	 */
+	smp_mb();
 	if (inode->i_flock)
 		return __break_lease(inode, mode, FL_DELEG);
 	return 0;
-- 
cgit v1.2.3


From 2940474af79744411da0cb63b041ad52c57bc443 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 11 Jun 2014 13:49:23 +0200
Subject: block: remove elv_abort_queue and blk_abort_flushes

elv_abort_queue has no callers, and blk_abort_flushes is only called by
elv_abort_queue.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/elevator.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 4ff262e2bf37..e2a6bd7fb133 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -133,7 +133,6 @@ extern struct request *elv_latter_request(struct request_queue *, struct request
 extern int elv_register_queue(struct request_queue *q);
 extern void elv_unregister_queue(struct request_queue *q);
 extern int elv_may_queue(struct request_queue *, int);
-extern void elv_abort_queue(struct request_queue *);
 extern void elv_completed_request(struct request_queue *, struct request *);
 extern int elv_set_request(struct request_queue *q, struct request *rq,
 			   struct bio *bio, gfp_t gfp_mask);
-- 
cgit v1.2.3


From a6e15a39048ec3229b9a53425f4384f55f6cc1b3 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 13 Jun 2014 13:30:35 -0700
Subject: PM / hibernate: introduce "nohibernate" boot parameter

To support using kernel features that are not compatible with hibernation,
this creates the "nohibernate" kernel boot parameter to disable both
hibernation and resume. This allows hibernation support to be a boot-time
choice instead of only a compile-time choice.

Signed-off-by: Kees Cook <keescook@chromium.org>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/suspend.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index f76994b9396c..519064e0c943 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -327,6 +327,7 @@ extern unsigned long get_safe_page(gfp_t gfp_mask);
 extern void hibernation_set_ops(const struct platform_hibernation_ops *ops);
 extern int hibernate(void);
 extern bool system_entering_hibernation(void);
+extern bool hibernation_available(void);
 asmlinkage int swsusp_save(void);
 extern struct pbe *restore_pblist;
 #else /* CONFIG_HIBERNATION */
@@ -339,6 +340,7 @@ static inline void swsusp_unset_page_free(struct page *p) {}
 static inline void hibernation_set_ops(const struct platform_hibernation_ops *ops) {}
 static inline int hibernate(void) { return -ENOSYS; }
 static inline bool system_entering_hibernation(void) { return false; }
+static inline bool hibernation_available(void) { return false; }
 #endif /* CONFIG_HIBERNATION */
 
 /* Hibernation and suspend events */
-- 
cgit v1.2.3


From 736ed4de766d4f0e8e6142dd4f9d73ef61835ed9 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Tue, 17 Jun 2014 22:09:29 -0700
Subject: block: blk_max_size_offset() should check ->max_sectors

Commit 762380ad9322 inadvertently changed a check for max_sectors
to max_hw_sectors. Revert that part, so we still compare against
max_sectors.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/blkdev.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 31e11051f1ba..713f8b62b435 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -920,7 +920,7 @@ static inline unsigned int blk_max_size_offset(struct request_queue *q,
 					       sector_t offset)
 {
 	if (!q->limits.chunk_sectors)
-		return q->limits.max_hw_sectors;
+		return q->limits.max_sectors;
 
 	return q->limits.chunk_sectors -
 			(offset & (q->limits.chunk_sectors - 1));
-- 
cgit v1.2.3


From 8537b12034cf1fd3fab3da2c859d71f76846fae9 Mon Sep 17 00:00:00 2001
From: Alexander Gordeev <agordeev@redhat.com>
Date: Tue, 17 Jun 2014 22:12:35 -0700
Subject: blk-mq: bitmap tag: fix races on shared ::wake_index fields

Fix racy updates of shared blk_mq_bitmap_tags::wake_index
and blk_mq_hw_ctx::wake_index fields.

Cc: Ming Lei <tom.leiming@gmail.com>
Signed-off-by: Alexander Gordeev <agordeev@redhat.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/blk-mq.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index a002cf191427..eb726b9c5762 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -42,7 +42,7 @@ struct blk_mq_hw_ctx {
 	unsigned int		nr_ctx;
 	struct blk_mq_ctx	**ctxs;
 
-	unsigned int		wait_index;
+	atomic_t		wait_index;
 
 	struct blk_mq_tags	*tags;
 
-- 
cgit v1.2.3


From 2b8f2a28eac1d35a432705d269f02bdaeba9be8f Mon Sep 17 00:00:00 2001
From: Daniel Mack <zonque@gmail.com>
Date: Wed, 18 Jun 2014 11:01:41 +0200
Subject: net: phylib: add link_change_notify callback to phy device

Add a notify callback to inform phy drivers when the core is about to
do its link adjustment. No change for drivers that do not implement
this callback.

Signed-off-by: Daniel Mack <zonque@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/phy.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/phy.h b/include/linux/phy.h
index 864ddafad8cc..68041446c450 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -536,6 +536,15 @@ struct phy_driver {
 	/* See set_wol, but for checking whether Wake on LAN is enabled. */
 	void (*get_wol)(struct phy_device *dev, struct ethtool_wolinfo *wol);
 
+	/*
+	 * Called to inform a PHY device driver when the core is about to
+	 * change the link state. This callback is supposed to be used as
+	 * fixup hook for drivers that need to take action when the link
+	 * state changes. Drivers are by no means allowed to mess with the
+	 * PHY device structure in their implementations.
+	 */
+	void (*link_change_notify)(struct phy_device *dev);
+
 	struct device_driver driver;
 };
 #define to_phy_driver(d) container_of(d, struct phy_driver, driver)
-- 
cgit v1.2.3


From e567bf7112518824830978d644dfb5a991e67d54 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Sun, 22 Jun 2014 16:32:48 -0600
Subject: Revert "block: add __init to elv_register"

This reverts commit b5097e956a4d2919ee248d6481e4204c5568ed5c.

The original commit is buggy, we do use the registration functions
at runtime, for instance when loading IO schedulers through sysfs.

Reported-by: Damien Wyart <damien.wyart@gmail.com>
---
 include/linux/elevator.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index e2a6bd7fb133..45a91474487d 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -143,7 +143,7 @@ extern void elv_drain_elevator(struct request_queue *);
  * io scheduler registration
  */
 extern void __init load_default_elevator_module(void);
-extern int __init elv_register(struct elevator_type *);
+extern int elv_register(struct elevator_type *);
 extern void elv_unregister(struct elevator_type *);
 
 /*
-- 
cgit v1.2.3


From b3acc56bfe1287c6b666e80edc70b89eea2a1a80 Mon Sep 17 00:00:00 2001
From: Petr Tesarik <ptesarik@suse.cz>
Date: Mon, 23 Jun 2014 13:22:03 -0700
Subject: kexec: save PG_head_mask in VMCOREINFO

To allow filtering of huge pages, makedumpfile must be able to identify
them in the dump.  This can be done by checking the appropriate page
flag, so communicate its value to makedumpfile through the VMCOREINFO
interface.

There's only one small catch.  Depending on how many page flags are
available on a given architecture, this bit can be called PG_head or
PG_compound.

I sent a similar patch back in 2012, but Eric Biederman did not like
using an #ifdef.  So, this time I'm adding a common symbol
(PG_head_mask) instead.

See https://lkml.org/lkml/2012/11/28/91 for the previous version.

Signed-off-by: Petr Tesarik <ptesarik@suse.cz>
Acked-by: Vivek Goyal <vgoyal@redhat.com>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Fengguang Wu <fengguang.wu@intel.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Shaohua Li <shli@kernel.org>
Cc: Alexey Kardashevskiy <aik@ozlabs.ru>
Cc: Sasha Levin <sasha.levin@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/page-flags.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 3c545b48aeab..8304959ad336 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -360,6 +360,9 @@ static inline void ClearPageCompound(struct page *page)
 	ClearPageHead(page);
 }
 #endif
+
+#define PG_head_mask ((1L << PG_head))
+
 #else
 /*
  * Reduce page flag use as much as possible by overlapping
-- 
cgit v1.2.3


From f3aca3d09525f87731ba6b892c9b010570bc54b4 Mon Sep 17 00:00:00 2001
From: Aaron Tomlin <atomlin@redhat.com>
Date: Mon, 23 Jun 2014 13:22:05 -0700
Subject: nmi: provide the option to issue an NMI back trace to every cpu but
 current

Sometimes it is preferred not to use the trigger_all_cpu_backtrace()
routine when one wants to avoid capturing a back trace for current.  For
instance if one was previously captured recently.

This patch provides a new routine namely
trigger_allbutself_cpu_backtrace() which offers the flexibility to issue
an NMI to every cpu but current and capture a back trace accordingly.

Patch x86 and sparc to support new routine.

[dzickus@redhat.com: add stub in #else clause]
[dzickus@redhat.com: don't print message in single processor case, wrap with get/put_cpu based on Oleg's suggestion]
[sfr@canb.auug.org.au: undo C99ism]
Signed-off-by: Aaron Tomlin <atomlin@redhat.com>
Signed-off-by: Don Zickus <dzickus@redhat.com>
Acked-by: David S. Miller <davem@davemloft.net>
Cc: Mateusz Guzik <mguzik@redhat.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/nmi.h | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index 6a45fb583ff1..a17ab6398d7c 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -32,15 +32,24 @@ static inline void touch_nmi_watchdog(void)
 #ifdef arch_trigger_all_cpu_backtrace
 static inline bool trigger_all_cpu_backtrace(void)
 {
-	arch_trigger_all_cpu_backtrace();
+	arch_trigger_all_cpu_backtrace(true);
 
 	return true;
 }
+static inline bool trigger_allbutself_cpu_backtrace(void)
+{
+	arch_trigger_all_cpu_backtrace(false);
+	return true;
+}
 #else
 static inline bool trigger_all_cpu_backtrace(void)
 {
 	return false;
 }
+static inline bool trigger_allbutself_cpu_backtrace(void)
+{
+	return false;
+}
 #endif
 
 #ifdef CONFIG_LOCKUP_DETECTOR
-- 
cgit v1.2.3


From ed235875e2ca983197831337a986f0517074e1a0 Mon Sep 17 00:00:00 2001
From: Aaron Tomlin <atomlin@redhat.com>
Date: Mon, 23 Jun 2014 13:22:05 -0700
Subject: kernel/watchdog.c: print traces for all cpus on lockup detection

A 'softlockup' is defined as a bug that causes the kernel to loop in
kernel mode for more than a predefined period to time, without giving
other tasks a chance to run.

Currently, upon detection of this condition by the per-cpu watchdog
task, debug information (including a stack trace) is sent to the system
log.

On some occasions, we have observed that the "victim" rather than the
actual "culprit" (i.e.  the owner/holder of the contended resource) is
reported to the user.  Often this information has proven to be
insufficient to assist debugging efforts.

To avoid loss of useful debug information, for architectures which
support NMI, this patch makes it possible to improve soft lockup
reporting.  This is accomplished by issuing an NMI to each cpu to obtain
a stack trace.

If NMI is not supported we just revert back to the old method.  A sysctl
and boot-time parameter is available to toggle this feature.

[dzickus@redhat.com: add CONFIG_SMP in certain areas]
[akpm@linux-foundation.org: additional CONFIG_SMP=n optimisations]
[mq@suse.cz: fix warning]
Signed-off-by: Aaron Tomlin <atomlin@redhat.com>
Signed-off-by: Don Zickus <dzickus@redhat.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Mateusz Guzik <mguzik@redhat.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Jan Moskyto Matejka <mq@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/nmi.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index a17ab6398d7c..447775ee2c4b 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -57,6 +57,7 @@ int hw_nmi_is_cpu_stuck(struct pt_regs *);
 u64 hw_nmi_get_sample_period(int watchdog_thresh);
 extern int watchdog_user_enabled;
 extern int watchdog_thresh;
+extern int sysctl_softlockup_all_cpu_backtrace;
 struct ctl_table;
 extern int proc_dowatchdog(struct ctl_table *, int ,
 			   void __user *, size_t *, loff_t *);
-- 
cgit v1.2.3


From 3a4b0eda8e4b27e6aca86f9f4d327c1070815e30 Mon Sep 17 00:00:00 2001
From: Gu Zheng <guz.fnst@cn.fujitsu.com>
Date: Tue, 24 Jun 2014 18:10:26 +0800
Subject: bio: remove unused macro bip_vec_idx()

Macro bip_vec_idx() was used by bio integrity originally, but no longer
used now. So remove it.

Signed-off-by: Gu Zheng <guz.fnst@cn.fujitsu.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/bio.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bio.h b/include/linux/bio.h
index 5a645769f020..f91decbca96b 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -644,10 +644,6 @@ struct biovec_slab {
 
 #if defined(CONFIG_BLK_DEV_INTEGRITY)
 
-
-
-#define bip_vec_idx(bip, idx)	(&(bip->bip_vec[(idx)]))
-
 #define bip_for_each_vec(bvl, bip, iter)				\
 	for_each_bvec(bvl, (bip)->bip_vec, iter, (bip)->bip_iter)
 
-- 
cgit v1.2.3


From 66cb45aa41315d1d9972cada354fbdf7870d7714 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Tue, 24 Jun 2014 16:22:24 -0600
Subject: block: add support for limiting gaps in SG lists

Another restriction inherited for NVMe - those devices don't support
SG lists that have "gaps" in them. Gaps refers to cases where the
previous SG entry doesn't end on a page boundary. For NVMe, all SG
entries must start at offset 0 (except the first) and end on a page
boundary (except the last).

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/bio.h    | 9 +++++++++
 include/linux/blkdev.h | 1 +
 2 files changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bio.h b/include/linux/bio.h
index f91decbca96b..d2633ee099d9 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -186,6 +186,15 @@ static inline void *bio_data(struct bio *bio)
 #define BIOVEC_SEG_BOUNDARY(q, b1, b2) \
 	__BIO_SEG_BOUNDARY(bvec_to_phys((b1)), bvec_to_phys((b2)) + (b2)->bv_len, queue_segment_boundary((q)))
 
+/*
+ * Check if adding a bio_vec after bprv with offset would create a gap in
+ * the SG list. Most drivers don't care about this, but some do.
+ */
+static inline bool bvec_gap_to_prev(struct bio_vec *bprv, unsigned int offset)
+{
+	return offset || ((bprv->bv_offset + bprv->bv_len) & (PAGE_SIZE - 1));
+}
+
 #define bio_io_error(bio) bio_endio((bio), -EIO)
 
 /*
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 713f8b62b435..8699bcf5f099 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -512,6 +512,7 @@ struct request_queue {
 #define QUEUE_FLAG_DEAD        19	/* queue tear-down finished */
 #define QUEUE_FLAG_INIT_DONE   20	/* queue is initialized */
 #define QUEUE_FLAG_NO_SG_MERGE 21	/* don't attempt to merge SG segments*/
+#define QUEUE_FLAG_SG_GAPS     22	/* queue doesn't support SG gaps */
 
 #define QUEUE_FLAG_DEFAULT	((1 << QUEUE_FLAG_IO_STAT) |		\
 				 (1 << QUEUE_FLAG_STACKABLE)	|	\
-- 
cgit v1.2.3


From 0b86dbf675e0170a191a9ca18e5e99fd39a678c0 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ZenIV.linux.org.uk>
Date: Mon, 23 Jun 2014 08:44:40 +0100
Subject: Fix 32-bit regression in block device read(2)

blkdev_read_iter() wants to cap the iov_iter by the amount of data
remaining to the end of device.  That's what iov_iter_truncate() is for
(trim iter->count if it's above the given limit).  So far, so good, but
the argument of iov_iter_truncate() is size_t, so on 32bit boxen (in
case of a large device) we end up with that upper limit truncated down
to 32 bits *before* comparing it with iter->count.

Easily fixed by making iov_iter_truncate() take 64bit argument - it does
the right thing after such change (we only reach the assignment in there
when the current value of iter->count is greater than the limit, i.e.
for anything that would get truncated we don't reach the assignment at
all) and that argument is not the new value of iter->count - it's an
upper limit for such.

The overhead of passing u64 is not an issue - the thing is inlined, so
callers passing size_t won't pay any penalty.

Reported-and-tested-by: Theodore Tso <tytso@mit.edu>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Tested-by: Alan Cox <gnomes@lxorguk.ukuu.org.uk>
Tested-by: Bruno Wolff III <bruno@wolff.to>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/uio.h | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/uio.h b/include/linux/uio.h
index e2231e47cec1..d54985e0705e 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -94,8 +94,20 @@ static inline size_t iov_iter_count(struct iov_iter *i)
 	return i->count;
 }
 
-static inline void iov_iter_truncate(struct iov_iter *i, size_t count)
+/*
+ * Cap the iov_iter by given limit; note that the second argument is
+ * *not* the new size - it's upper limit for such.  Passing it a value
+ * greater than the amount of data in iov_iter is fine - it'll just do
+ * nothing in that case.
+ */
+static inline void iov_iter_truncate(struct iov_iter *i, u64 count)
 {
+	/*
+	 * count doesn't have to fit in size_t - comparison extends both
+	 * operands to u64 here and any value that would be truncated by
+	 * conversion in assignement is by definition greater than all
+	 * values of size_t, including old i->count.
+	 */
 	if (i->count > count)
 		i->count = count;
 }
-- 
cgit v1.2.3


From ac5ccdba3a1659b3517e7e99ef7d35a6a2d77cf4 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Thu, 19 Jun 2014 21:22:56 +0300
Subject: iovec: move memcpy_from/toiovecend to lib/iovec.c

ERROR: "memcpy_fromiovecend" [drivers/vhost/vhost_scsi.ko] undefined!

commit 9f977ef7b671f6169eca78bf40f230fe84b7c7e5
    vhost-scsi: Include prot_bytes into expected data transfer length
in target-pending makes drivers/vhost/scsi.c call memcpy_fromiovecend().
This function is not available when CONFIG_NET is not enabled.

socket.h already includes uio.h, so no callers need updating.

Reported-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Nicholas Bellinger <nab@linux-iscsi.org>
---
 include/linux/socket.h | 4 ----
 include/linux/uio.h    | 5 ++++-
 2 files changed, 4 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/socket.h b/include/linux/socket.h
index 8e98297f1388..ec538fc287a6 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -305,8 +305,6 @@ struct ucred {
 /* IPX options */
 #define IPX_TYPE	1
 
-extern int memcpy_fromiovecend(unsigned char *kdata, const struct iovec *iov,
-			       int offset, int len);
 extern int csum_partial_copy_fromiovecend(unsigned char *kdata, 
 					  struct iovec *iov, 
 					  int offset, 
@@ -315,8 +313,6 @@ extern unsigned long iov_pages(const struct iovec *iov, int offset,
 			       unsigned long nr_segs);
 
 extern int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr_storage *address, int mode);
-extern int memcpy_toiovecend(const struct iovec *v, unsigned char *kdata,
-			     int offset, int len);
 extern int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr_storage *kaddr);
 extern int put_cmsg(struct msghdr*, int level, int type, int len, void *data);
 
diff --git a/include/linux/uio.h b/include/linux/uio.h
index e2231e47cec1..04c8c4bb4927 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -111,6 +111,9 @@ static inline void iov_iter_reexpand(struct iov_iter *i, size_t count)
 
 int memcpy_fromiovec(unsigned char *kdata, struct iovec *iov, int len);
 int memcpy_toiovec(struct iovec *iov, unsigned char *kdata, int len);
-
+int memcpy_fromiovecend(unsigned char *kdata, const struct iovec *iov,
+			int offset, int len);
+int memcpy_toiovecend(const struct iovec *v, unsigned char *kdata,
+		      int offset, int len);
 
 #endif
-- 
cgit v1.2.3


From 4e26445faad366d67d7723622bf6a60a6f0f5993 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizefan@huawei.com>
Date: Mon, 30 Jun 2014 11:50:28 +0800
Subject: kernfs: introduce kernfs_pin_sb()

kernfs_pin_sb() tries to get a refcnt of the superblock.

This will be used by cgroupfs.

v2:
- make kernfs_pin_sb() return the superblock.
- drop kernfs_drop_sb().

tj: Updated the comment a bit.

[ This is a prerequisite for a bugfix. ]
Cc: <stable@vger.kernel.org> # 3.15
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/kernfs.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index 17aa1cce6f8e..20f493564917 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -304,6 +304,7 @@ struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags,
 			       struct kernfs_root *root, unsigned long magic,
 			       bool *new_sb_created, const void *ns);
 void kernfs_kill_sb(struct super_block *sb);
+struct super_block *kernfs_pin_sb(struct kernfs_root *root, const void *ns);
 
 void kernfs_init(void);
 
-- 
cgit v1.2.3


From b14bf2d0c0358140041d1c1805a674376964d0e0 Mon Sep 17 00:00:00 2001
From: Alan Stern <stern@rowland.harvard.edu>
Date: Mon, 30 Jun 2014 11:04:21 -0400
Subject: usb-storage/SCSI: Add broken_fua blacklist flag
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Some buggy JMicron USB-ATA bridges don't know how to translate the FUA
bit in READs or WRITEs.  This patch adds an entry in unusual_devs.h
and a blacklist flag to tell the sd driver not to use FUA.

Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
Reported-by: Michael Büsch <m@bues.ch>
Tested-by: Michael Büsch <m@bues.ch>
Acked-by: James Bottomley <James.Bottomley@HansenPartnership.com>
CC: Matthew Dharm <mdharm-usb@one-eyed-alien.net>
CC: <stable@vger.kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/usb_usual.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/usb_usual.h b/include/linux/usb_usual.h
index 1a64b26046ed..9b7de1b46437 100644
--- a/include/linux/usb_usual.h
+++ b/include/linux/usb_usual.h
@@ -70,7 +70,9 @@
 	US_FLAG(NEEDS_CAP16,	0x00400000)			\
 		/* cannot handle READ_CAPACITY_10 */		\
 	US_FLAG(IGNORE_UAS,	0x00800000)			\
-		/* Device advertises UAS but it is broken */
+		/* Device advertises UAS but it is broken */	\
+	US_FLAG(BROKEN_FUA,	0x01000000)			\
+		/* Cannot handle FUA in WRITE or READ CDBs */	\
 
 #define US_FLAG(name, value)	US_FL_##name = value ,
 enum { US_DO_ALL_FLAGS };
-- 
cgit v1.2.3


From 330d282216d6e4d845a21b72572dc4df4122e8fa Mon Sep 17 00:00:00 2001
From: Zhengyu He <hzy@google.com>
Date: Tue, 1 Jul 2014 12:11:47 -0700
Subject: core: fix typo in percpu read_mostly section

This fixes a typo that named the read_mostly section of percpu as
readmostly. It works fine with SMP because the linker script specifies
.data..percpu..readmostly. However, UP kernel builds don't have percpu
sections defined and the non-percpu version of the section is called
data..read_mostly, so .data..readmostly will float around and may break
things unexpectedly.

Looking at the original change that introduced data..percpu..readmostly
(commit c957ef2c59e952803766ddc22e89981ab534606f), it looks like this
was the original intention.

Tested: Built UP kernel and confirmed the sections got merged.

- Before the patch:
$ objdump -h vmlinux.o  | grep '\.data\.\.read.*mostly'
38 .data..read_mostly 00004418  0000000000000000  0000000000000000  00431ac0  2**6
50 .data..readmostly 00000014  0000000000000000  0000000000000000  00444000  2**3

- After the patch:
$ objdump -h vmlinux.o  | grep '\.data\.\.read.*mostly'
38 .data..read_mostly 00004438  0000000000000000  0000000000000000  00431ac0  2**6

Signed-off-by: Zhengyu He <hzy@google.com>
Signed-off-by: Filipe Brandenburger <filbranden@google.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/percpu-defs.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h
index a5fc7d01aad6..dec01d6c3f80 100644
--- a/include/linux/percpu-defs.h
+++ b/include/linux/percpu-defs.h
@@ -146,10 +146,10 @@
  * Declaration/definition used for per-CPU variables that must be read mostly.
  */
 #define DECLARE_PER_CPU_READ_MOSTLY(type, name)			\
-	DECLARE_PER_CPU_SECTION(type, name, "..readmostly")
+	DECLARE_PER_CPU_SECTION(type, name, "..read_mostly")
 
 #define DEFINE_PER_CPU_READ_MOSTLY(type, name)				\
-	DEFINE_PER_CPU_SECTION(type, name, "..readmostly")
+	DEFINE_PER_CPU_SECTION(type, name, "..read_mostly")
 
 /*
  * Intermodule exports for per-CPU variables.  sparse forgets about
-- 
cgit v1.2.3


From ecca47ce8294843045e7465d76fee84dbf07a004 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 1 Jul 2014 16:41:03 -0400
Subject: kernfs: kernfs_notify() must be useable from non-sleepable contexts

d911d9874801 ("kernfs: make kernfs_notify() trigger inotify events
too") added fsnotify triggering to kernfs_notify() which requires a
sleepable context.  There are already existing users of
kernfs_notify() which invoke it from an atomic context and in general
it's silly to require a sleepable context for triggering a
notification.

The following is an invalid context bug triggerd by md invoking
sysfs_notify() from IO completion path.

 BUG: sleeping function called from invalid context at kernel/locking/mutex.c:586
 in_atomic(): 1, irqs_disabled(): 1, pid: 0, name: swapper/1
 2 locks held by swapper/1/0:
  #0:  (&(&vblk->vq_lock)->rlock){-.-...}, at: [<ffffffffa0039042>] virtblk_done+0x42/0xe0 [virtio_blk]
  #1:  (&(&bitmap->counts.lock)->rlock){-.....}, at: [<ffffffff81633718>] bitmap_endwrite+0x68/0x240
 irq event stamp: 33518
 hardirqs last  enabled at (33515): [<ffffffff8102544f>] default_idle+0x1f/0x230
 hardirqs last disabled at (33516): [<ffffffff818122ed>] common_interrupt+0x6d/0x72
 softirqs last  enabled at (33518): [<ffffffff810a1272>] _local_bh_enable+0x22/0x50
 softirqs last disabled at (33517): [<ffffffff810a29e0>] irq_enter+0x60/0x80
 CPU: 1 PID: 0 Comm: swapper/1 Not tainted 3.16.0-0.rc2.git2.1.fc21.x86_64 #1
 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
  0000000000000000 f90db13964f4ee05 ffff88007d403b80 ffffffff81807b4c
  0000000000000000 ffff88007d403ba8 ffffffff810d4f14 0000000000000000
  0000000000441800 ffff880078fa1780 ffff88007d403c38 ffffffff8180caf2
 Call Trace:
  <IRQ>  [<ffffffff81807b4c>] dump_stack+0x4d/0x66
  [<ffffffff810d4f14>] __might_sleep+0x184/0x240
  [<ffffffff8180caf2>] mutex_lock_nested+0x42/0x440
  [<ffffffff812d76a0>] kernfs_notify+0x90/0x150
  [<ffffffff8163377c>] bitmap_endwrite+0xcc/0x240
  [<ffffffffa00de863>] close_write+0x93/0xb0 [raid1]
  [<ffffffffa00df029>] r1_bio_write_done+0x29/0x50 [raid1]
  [<ffffffffa00e0474>] raid1_end_write_request+0xe4/0x260 [raid1]
  [<ffffffff813acb8b>] bio_endio+0x6b/0xa0
  [<ffffffff813b46c4>] blk_update_request+0x94/0x420
  [<ffffffff813bf0ea>] blk_mq_end_io+0x1a/0x70
  [<ffffffffa00392c2>] virtblk_request_done+0x32/0x80 [virtio_blk]
  [<ffffffff813c0648>] __blk_mq_complete_request+0x88/0x120
  [<ffffffff813c070a>] blk_mq_complete_request+0x2a/0x30
  [<ffffffffa0039066>] virtblk_done+0x66/0xe0 [virtio_blk]
  [<ffffffffa002535a>] vring_interrupt+0x3a/0xa0 [virtio_ring]
  [<ffffffff81116177>] handle_irq_event_percpu+0x77/0x340
  [<ffffffff8111647d>] handle_irq_event+0x3d/0x60
  [<ffffffff81119436>] handle_edge_irq+0x66/0x130
  [<ffffffff8101c3e4>] handle_irq+0x84/0x150
  [<ffffffff818146ad>] do_IRQ+0x4d/0xe0
  [<ffffffff818122f2>] common_interrupt+0x72/0x72
  <EOI>  [<ffffffff8105f706>] ? native_safe_halt+0x6/0x10
  [<ffffffff81025454>] default_idle+0x24/0x230
  [<ffffffff81025f9f>] arch_cpu_idle+0xf/0x20
  [<ffffffff810f5adc>] cpu_startup_entry+0x37c/0x7b0
  [<ffffffff8104df1b>] start_secondary+0x25b/0x300

This patch fixes it by punting the notification delivery through a
work item.  This ends up adding an extra pointer to kernfs_elem_attr
enlarging kernfs_node by a pointer, which is not ideal but not a very
big deal either.  If this turns out to be an actual issue, we can move
kernfs_elem_attr->size to kernfs_node->iattr later.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Josh Boyer <jwboyer@fedoraproject.org>
Cc: Jens Axboe <axboe@kernel.dk>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/kernfs.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index 17aa1cce6f8e..145375ea0bd9 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -91,6 +91,7 @@ struct kernfs_elem_attr {
 	const struct kernfs_ops	*ops;
 	struct kernfs_open_node	*open;
 	loff_t			size;
+	struct kernfs_node	*notify_next;	/* for kernfs_notify() */
 };
 
 /*
-- 
cgit v1.2.3


From b9cd18de4db3c9ffa7e17b0dc0ca99ed5aa4d43a Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 3 Jul 2014 15:43:15 -0400
Subject: ptrace,x86: force IRET path after a ptrace_stop()

The 'sysret' fastpath does not correctly restore even all regular
registers, much less any segment registers or reflags values.  That is
very much part of why it's faster than 'iret'.

Normally that isn't a problem, because the normal ptrace() interface
catches the process using the signal handler infrastructure, which
always returns with an iret.

However, some paths can get caught using ptrace_event() instead of the
signal path, and for those we need to make sure that we aren't going to
return to user space using 'sysret'.  Otherwise the modifications that
may have been done to the register set by the tracer wouldn't
necessarily take effect.

Fix it by forcing IRET path by setting TIF_NOTIFY_RESUME from
arch_ptrace_stop_needed() which is invoked from ptrace_stop().

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Andy Lutomirski <luto@amacapital.net>
Acked-by: Oleg Nesterov <oleg@redhat.com>
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: stable@vger.kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/ptrace.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index 077904c8b70d..cc79eff4a1ad 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -334,6 +334,9 @@ static inline void user_single_step_siginfo(struct task_struct *tsk,
  * calling arch_ptrace_stop() when it would be superfluous.  For example,
  * if the thread has not been back to user mode since the last stop, the
  * thread state might indicate that nothing needs to be done.
+ *
+ * This is guaranteed to be invoked once before a task stops for ptrace and
+ * may include arch-specific operations necessary prior to a ptrace stop.
  */
 #define arch_ptrace_stop_needed(code, info)	(0)
 #endif
-- 
cgit v1.2.3


From 046a619d8e9746fa4c0e29e8c6b78e16efc008a8 Mon Sep 17 00:00:00 2001
From: Jason Low <jason.low2@hp.com>
Date: Mon, 14 Jul 2014 10:27:48 -0700
Subject: locking/spinlocks/mcs: Rename optimistic_spin_queue() to
 optimistic_spin_node()

Currently, the per-cpu nodes structure for the cancellable MCS spinlock is
named "optimistic_spin_queue". However, in a follow up patch in the series
we will be introducing a new structure that serves as the new "handle" for
the lock. It would make more sense if that structure is named
"optimistic_spin_queue". Additionally, since the current use of the
"optimistic_spin_queue" structure are  "nodes", it might be better if we
rename them to "node" anyway.

This preparatory patch renames all current "optimistic_spin_queue"
to "optimistic_spin_node".

Signed-off-by: Jason Low <jason.low2@hp.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: Scott Norton <scott.norton@hp.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Waiman Long <waiman.long@hp.com>
Cc: Davidlohr Bueso <davidlohr@hp.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Aswin Chandramouleeswaran <aswin@hp.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Chris Mason <clm@fb.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Josef Bacik <jbacik@fusionio.com>
Link: http://lkml.kernel.org/r/1405358872-3732-2-git-send-email-jason.low2@hp.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/mutex.h | 4 ++--
 include/linux/rwsem.h | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index 11692dea18aa..885f3f56a77f 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -46,7 +46,7 @@
  * - detects multi-task circular deadlocks and prints out all affected
  *   locks and tasks (and only those tasks)
  */
-struct optimistic_spin_queue;
+struct optimistic_spin_node;
 struct mutex {
 	/* 1: unlocked, 0: locked, negative: locked, possible waiters */
 	atomic_t		count;
@@ -56,7 +56,7 @@ struct mutex {
 	struct task_struct	*owner;
 #endif
 #ifdef CONFIG_MUTEX_SPIN_ON_OWNER
-	struct optimistic_spin_queue	*osq;	/* Spinner MCS lock */
+	struct optimistic_spin_node	*osq;	/* Spinner MCS lock */
 #endif
 #ifdef CONFIG_DEBUG_MUTEXES
 	const char 		*name;
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index 8d79708146aa..ba3f108ddea1 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -16,7 +16,7 @@
 
 #include <linux/atomic.h>
 
-struct optimistic_spin_queue;
+struct optimistic_spin_node;
 struct rw_semaphore;
 
 #ifdef CONFIG_RWSEM_GENERIC_SPINLOCK
@@ -33,7 +33,7 @@ struct rw_semaphore {
 	 * if the owner is running on the cpu.
 	 */
 	struct task_struct *owner;
-	struct optimistic_spin_queue *osq; /* spinner MCS lock */
+	struct optimistic_spin_node *osq; /* spinner MCS lock */
 #endif
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 	struct lockdep_map	dep_map;
-- 
cgit v1.2.3


From 90631822c5d307b5410500806e8ac3e63928aa3e Mon Sep 17 00:00:00 2001
From: Jason Low <jason.low2@hp.com>
Date: Mon, 14 Jul 2014 10:27:49 -0700
Subject: locking/spinlocks/mcs: Convert osq lock to atomic_t to reduce
 overhead

The cancellable MCS spinlock is currently used to queue threads that are
doing optimistic spinning. It uses per-cpu nodes, where a thread obtaining
the lock would access and queue the local node corresponding to the CPU that
it's running on. Currently, the cancellable MCS lock is implemented by using
pointers to these nodes.

In this patch, instead of operating on pointers to the per-cpu nodes, we
store the CPU numbers in which the per-cpu nodes correspond to in atomic_t.
A similar concept is used with the qspinlock.

By operating on the CPU # of the nodes using atomic_t instead of pointers
to those nodes, this can reduce the overhead of the cancellable MCS spinlock
by 32 bits (on 64 bit systems).

Signed-off-by: Jason Low <jason.low2@hp.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: Scott Norton <scott.norton@hp.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Waiman Long <waiman.long@hp.com>
Cc: Davidlohr Bueso <davidlohr@hp.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Aswin Chandramouleeswaran <aswin@hp.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Chris Mason <clm@fb.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Josef Bacik <jbacik@fusionio.com>
Link: http://lkml.kernel.org/r/1405358872-3732-3-git-send-email-jason.low2@hp.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/mutex.h    |  4 ++--
 include/linux/osq_lock.h | 19 +++++++++++++++++++
 include/linux/rwsem.h    |  7 +++----
 3 files changed, 24 insertions(+), 6 deletions(-)
 create mode 100644 include/linux/osq_lock.h

(limited to 'include/linux')

diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index 885f3f56a77f..42aa9b9ecd5f 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -17,6 +17,7 @@
 #include <linux/lockdep.h>
 #include <linux/atomic.h>
 #include <asm/processor.h>
+#include <linux/osq_lock.h>
 
 /*
  * Simple, straightforward mutexes with strict semantics:
@@ -46,7 +47,6 @@
  * - detects multi-task circular deadlocks and prints out all affected
  *   locks and tasks (and only those tasks)
  */
-struct optimistic_spin_node;
 struct mutex {
 	/* 1: unlocked, 0: locked, negative: locked, possible waiters */
 	atomic_t		count;
@@ -56,7 +56,7 @@ struct mutex {
 	struct task_struct	*owner;
 #endif
 #ifdef CONFIG_MUTEX_SPIN_ON_OWNER
-	struct optimistic_spin_node	*osq;	/* Spinner MCS lock */
+	struct optimistic_spin_queue osq; /* Spinner MCS lock */
 #endif
 #ifdef CONFIG_DEBUG_MUTEXES
 	const char 		*name;
diff --git a/include/linux/osq_lock.h b/include/linux/osq_lock.h
new file mode 100644
index 000000000000..b001682bf7cb
--- /dev/null
+++ b/include/linux/osq_lock.h
@@ -0,0 +1,19 @@
+#ifndef __LINUX_OSQ_LOCK_H
+#define __LINUX_OSQ_LOCK_H
+
+/*
+ * An MCS like lock especially tailored for optimistic spinning for sleeping
+ * lock implementations (mutex, rwsem, etc).
+ */
+
+#define OSQ_UNLOCKED_VAL (0)
+
+struct optimistic_spin_queue {
+	/*
+	 * Stores an encoded value of the CPU # of the tail node in the queue.
+	 * If the queue is empty, then it's set to OSQ_UNLOCKED_VAL.
+	 */
+	atomic_t tail;
+};
+
+#endif
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index ba3f108ddea1..9fdcdd03507d 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -13,10 +13,9 @@
 #include <linux/kernel.h>
 #include <linux/list.h>
 #include <linux/spinlock.h>
-
 #include <linux/atomic.h>
+#include <linux/osq_lock.h>
 
-struct optimistic_spin_node;
 struct rw_semaphore;
 
 #ifdef CONFIG_RWSEM_GENERIC_SPINLOCK
@@ -33,7 +32,7 @@ struct rw_semaphore {
 	 * if the owner is running on the cpu.
 	 */
 	struct task_struct *owner;
-	struct optimistic_spin_node *osq; /* spinner MCS lock */
+	struct optimistic_spin_queue osq; /* spinner MCS lock */
 #endif
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 	struct lockdep_map	dep_map;
@@ -70,7 +69,7 @@ static inline int rwsem_is_locked(struct rw_semaphore *sem)
 	  __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock),	\
 	  LIST_HEAD_INIT((name).wait_list),		\
 	  NULL, /* owner */				\
-	  NULL /* mcs lock */                           \
+	  { ATOMIC_INIT(OSQ_UNLOCKED_VAL) } /* osq */   \
 	  __RWSEM_DEP_MAP_INIT(name) }
 #else
 #define __RWSEM_INITIALIZER(name)			\
-- 
cgit v1.2.3


From 4d9d951e6b5df85ccfca2c5bd8b4f5c71d256b65 Mon Sep 17 00:00:00 2001
From: Jason Low <jason.low2@hp.com>
Date: Mon, 14 Jul 2014 10:27:50 -0700
Subject: locking/spinlocks/mcs: Introduce and use init macro and function for
 osq locks

Currently, we initialize the osq lock by directly setting the lock's values. It
would be preferable if we use an init macro to do the initialization like we do
with other locks.

This patch introduces and uses a macro and function for initializing the osq lock.

Signed-off-by: Jason Low <jason.low2@hp.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: Scott Norton <scott.norton@hp.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Waiman Long <waiman.long@hp.com>
Cc: Davidlohr Bueso <davidlohr@hp.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Aswin Chandramouleeswaran <aswin@hp.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Chris Mason <clm@fb.com>
Cc: Josef Bacik <jbacik@fusionio.com>
Link: http://lkml.kernel.org/r/1405358872-3732-4-git-send-email-jason.low2@hp.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/osq_lock.h | 8 ++++++++
 include/linux/rwsem.h    | 2 +-
 2 files changed, 9 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/osq_lock.h b/include/linux/osq_lock.h
index b001682bf7cb..90230d5811c5 100644
--- a/include/linux/osq_lock.h
+++ b/include/linux/osq_lock.h
@@ -16,4 +16,12 @@ struct optimistic_spin_queue {
 	atomic_t tail;
 };
 
+/* Init macro and function. */
+#define OSQ_LOCK_UNLOCKED { ATOMIC_INIT(OSQ_UNLOCKED_VAL) }
+
+static inline void osq_lock_init(struct optimistic_spin_queue *lock)
+{
+	atomic_set(&lock->tail, OSQ_UNLOCKED_VAL);
+}
+
 #endif
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index 9fdcdd03507d..25cd9aa2f3d7 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -69,7 +69,7 @@ static inline int rwsem_is_locked(struct rw_semaphore *sem)
 	  __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock),	\
 	  LIST_HEAD_INIT((name).wait_list),		\
 	  NULL, /* owner */				\
-	  { ATOMIC_INIT(OSQ_UNLOCKED_VAL) } /* osq */   \
+	  OSQ_LOCK_UNLOCKED /* osq */			\
 	  __RWSEM_DEP_MAP_INIT(name) }
 #else
 #define __RWSEM_INITIALIZER(name)			\
-- 
cgit v1.2.3


From 13b9a962a2594ee880c5d50d7f70964da1d4fe5a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 16 Jul 2014 14:54:55 +0200
Subject: locking/rwsem: Rename 'activity' to 'count'

There are two definitions of struct rw_semaphore, one in linux/rwsem.h
and one in linux/rwsem-spinlock.h.

For some reason they have different names for the initial field. This
makes it impossible to use C99 named initialization for
__RWSEM_INITIALIZER() -- or we have to duplicate that entire thing
along with the structure definitions.

The simpler patch is renaming the rwsem-spinlock variant to match the
regular rwsem.

This allows us to switch to C99 named initialization.

Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/n/tip-bmrZolsbGmautmzrerog27io@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/rwsem-spinlock.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rwsem-spinlock.h b/include/linux/rwsem-spinlock.h
index d5b13bc07a0b..561e8615528d 100644
--- a/include/linux/rwsem-spinlock.h
+++ b/include/linux/rwsem-spinlock.h
@@ -15,13 +15,13 @@
 #ifdef __KERNEL__
 /*
  * the rw-semaphore definition
- * - if activity is 0 then there are no active readers or writers
- * - if activity is +ve then that is the number of active readers
- * - if activity is -1 then there is one active writer
+ * - if count is 0 then there are no active readers or writers
+ * - if count is +ve then that is the number of active readers
+ * - if count is -1 then there is one active writer
  * - if wait_list is not empty, then there are processes waiting for the semaphore
  */
 struct rw_semaphore {
-	__s32			activity;
+	__s32			count;
 	raw_spinlock_t		wait_lock;
 	struct list_head	wait_list;
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
-- 
cgit v1.2.3


From ce069fc920e5734558b3d9cbef1ab06cf01ee793 Mon Sep 17 00:00:00 2001
From: Jason Low <jason.low2@hp.com>
Date: Mon, 14 Jul 2014 10:27:52 -0700
Subject: locking/rwsem: Reduce the size of struct rw_semaphore

Recent optimistic spinning additions to rwsem provide significant performance
benefits on many workloads on large machines. The cost of it was increasing
the size of the rwsem structure by up to 128 bits.

However, now that the previous patches in this series bring the overhead of
struct optimistic_spin_queue to 32 bits, this patch reorders some fields in
struct rw_semaphore such that we can reduce the overhead of the rwsem structure
by 64 bits (on 64 bit systems).

The extra overhead required for rwsem optimistic spinning would now be up
to 8 additional bytes instead of up to 16 bytes. Additionally, the size of
rwsem would now be more in line with mutexes.

Signed-off-by: Jason Low <jason.low2@hp.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: Scott Norton <scott.norton@hp.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Waiman Long <waiman.long@hp.com>
Cc: Davidlohr Bueso <davidlohr@hp.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Aswin Chandramouleeswaran <aswin@hp.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Chris Mason <clm@fb.com>
Cc: Josef Bacik <jbacik@fusionio.com>
Link: http://lkml.kernel.org/r/1405358872-3732-6-git-send-email-jason.low2@hp.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/rwsem.h | 25 +++++++++++--------------
 1 file changed, 11 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index 25cd9aa2f3d7..716807f0eb2d 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -24,15 +24,15 @@ struct rw_semaphore;
 /* All arch specific implementations share the same struct */
 struct rw_semaphore {
 	long count;
-	raw_spinlock_t wait_lock;
 	struct list_head wait_list;
+	raw_spinlock_t wait_lock;
 #ifdef CONFIG_SMP
+	struct optimistic_spin_queue osq; /* spinner MCS lock */
 	/*
 	 * Write owner. Used as a speculative check to see
 	 * if the owner is running on the cpu.
 	 */
 	struct task_struct *owner;
-	struct optimistic_spin_queue osq; /* spinner MCS lock */
 #endif
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 	struct lockdep_map	dep_map;
@@ -64,21 +64,18 @@ static inline int rwsem_is_locked(struct rw_semaphore *sem)
 #endif
 
 #if defined(CONFIG_SMP) && !defined(CONFIG_RWSEM_GENERIC_SPINLOCK)
-#define __RWSEM_INITIALIZER(name)			\
-	{ RWSEM_UNLOCKED_VALUE,				\
-	  __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock),	\
-	  LIST_HEAD_INIT((name).wait_list),		\
-	  NULL, /* owner */				\
-	  OSQ_LOCK_UNLOCKED /* osq */			\
-	  __RWSEM_DEP_MAP_INIT(name) }
+#define __RWSEM_OPT_INIT(lockname) , .osq = OSQ_LOCK_UNLOCKED, .owner = NULL
 #else
-#define __RWSEM_INITIALIZER(name)			\
-	{ RWSEM_UNLOCKED_VALUE,				\
-	  __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock),	\
-	  LIST_HEAD_INIT((name).wait_list)		\
-	  __RWSEM_DEP_MAP_INIT(name) }
+#define __RWSEM_OPT_INIT(lockname)
 #endif
 
+#define __RWSEM_INITIALIZER(name)				\
+	{ .count = RWSEM_UNLOCKED_VALUE,			\
+	  .wait_list = LIST_HEAD_INIT((name).wait_list),	\
+	  .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock)	\
+	  __RWSEM_OPT_INIT(name)				\
+	  __RWSEM_DEP_MAP_INIT(name) }
+
 #define DECLARE_RWSEM(name) \
 	struct rw_semaphore name = __RWSEM_INITIALIZER(name)
 
-- 
cgit v1.2.3


From 5db6c6fefb1ca0e81e3bd6dd8998bf51c453d823 Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <davidlohr@hp.com>
Date: Fri, 11 Jul 2014 14:00:06 -0700
Subject: locking/rwsem: Add CONFIG_RWSEM_SPIN_ON_OWNER

Just like with mutexes (CONFIG_MUTEX_SPIN_ON_OWNER),
encapsulate the dependencies for rwsem optimistic spinning.
No logical changes here as it continues to depend on both
SMP and the XADD algorithm variant.

Signed-off-by: Davidlohr Bueso <davidlohr@hp.com>
Acked-by: Jason Low <jason.low2@hp.com>
[ Also make it depend on ARCH_SUPPORTS_ATOMIC_RMW. ]
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1405112406-13052-2-git-send-email-davidlohr@hp.com
Cc: aswin@hp.com
Cc: Chris Mason <clm@fb.com>
Cc: Davidlohr Bueso <davidlohr@hp.com>
Cc: Josef Bacik <jbacik@fusionio.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Waiman Long <Waiman.Long@hp.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>

Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/rwsem.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index 716807f0eb2d..035d3c57fc8a 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -14,7 +14,9 @@
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/atomic.h>
+#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
 #include <linux/osq_lock.h>
+#endif
 
 struct rw_semaphore;
 
@@ -26,7 +28,7 @@ struct rw_semaphore {
 	long count;
 	struct list_head wait_list;
 	raw_spinlock_t wait_lock;
-#ifdef CONFIG_SMP
+#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
 	struct optimistic_spin_queue osq; /* spinner MCS lock */
 	/*
 	 * Write owner. Used as a speculative check to see
@@ -63,7 +65,7 @@ static inline int rwsem_is_locked(struct rw_semaphore *sem)
 # define __RWSEM_DEP_MAP_INIT(lockname)
 #endif
 
-#if defined(CONFIG_SMP) && !defined(CONFIG_RWSEM_GENERIC_SPINLOCK)
+#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
 #define __RWSEM_OPT_INIT(lockname) , .osq = OSQ_LOCK_UNLOCKED, .owner = NULL
 #else
 #define __RWSEM_OPT_INIT(lockname)
-- 
cgit v1.2.3