From 6ada1fc0e1c4775de0e043e1bd3ae9d065491aa5 Mon Sep 17 00:00:00 2001
From: Sasha Levin <sasha.levin@oracle.com>
Date: Wed, 3 Dec 2014 19:22:48 -0500
Subject: time: settimeofday: Validate the values of tv from user

An unvalidated user input is multiplied by a constant, which can result in
an undefined behaviour for large values. While this is validated later,
we should avoid triggering undefined behaviour.

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: stable <stable@vger.kernel.org>
Signed-off-by: Sasha Levin <sasha.levin@oracle.com>
[jstultz: include trivial milisecond->microsecond correction noticed
by Andy]
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 include/linux/time.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/time.h b/include/linux/time.h
index 8c42cf8d2444..5989b0ead1ec 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -99,6 +99,19 @@ static inline bool timespec_valid_strict(const struct timespec *ts)
 	return true;
 }
 
+static inline bool timeval_valid(const struct timeval *tv)
+{
+	/* Dates before 1970 are bogus */
+	if (tv->tv_sec < 0)
+		return false;
+
+	/* Can't have more microseconds then a second */
+	if (tv->tv_usec < 0 || tv->tv_usec >= USEC_PER_SEC)
+		return false;
+
+	return true;
+}
+
 extern struct timespec timespec_trunc(struct timespec t, unsigned gran);
 
 #define CURRENT_TIME		(current_kernel_time())
-- 
cgit v1.2.3


From e61f7d1c3c07a7e51036b0796749edb00deff845 Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen@oracle.com>
Date: Thu, 8 Jan 2015 10:34:27 -0500
Subject: libata: Whitelist SSDs that are known to properly return zeroes after
 TRIM

As defined, the DRAT (Deterministic Read After Trim) and RZAT (Return
Zero After Trim) flags in the ATA Command Set are unreliable in the
sense that they only define what happens if the device successfully
executed the DSM TRIM command. TRIM is only advisory, however, and the
device is free to silently ignore all or parts of the request.

In practice this renders the DRAT and RZAT flags completely useless and
because the results are unpredictable we decided to disable discard in
MD for 3.18 to avoid the risk of data corruption.

Hardware vendors in the real world obviously need better guarantees than
what the standards bodies provide. Unfortuntely those guarantees are
encoded in product requirements documents rather than somewhere we can
key off of them programatically. So we are compelled to disabling
discard_zeroes_data for all devices unless we explicitly have data to
support whitelisting them.

This patch whitelists SSDs from a few of the main vendors. None of the
whitelists are based on written guarantees. They are purely based on
empirical evidence collected from internal and external users that have
tested or qualified these drives in RAID deployments.

The whitelist is only meant as a starting point and is by no means
comprehensive:

   - All intel SSD models except for 510
   - Micron M5?0/M600
   - Samsung SSDs
   - Seagate SSDs

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/libata.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index 2d182413b1db..f2b440e44fd7 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -422,6 +422,7 @@ enum {
 	ATA_HORKAGE_NO_NCQ_TRIM	= (1 << 19),	/* don't use queued TRIM */
 	ATA_HORKAGE_NOLPM	= (1 << 20),	/* don't use LPM */
 	ATA_HORKAGE_WD_BROKEN_LPM = (1 << 21),	/* some WDs have broken LPM */
+	ATA_HORKAGE_ZERO_AFTER_TRIM = (1 << 22),/* guarantees zero after trim */
 
 	 /* DMA mask for user DMA control: User visible values; DO NOT
 	    renumber */
-- 
cgit v1.2.3


From ad26aa6c60974acf3228ed0ade97ba5793093dbe Mon Sep 17 00:00:00 2001
From: Jonghwa Lee <jonghwa3.lee@samsung.com>
Date: Thu, 8 Jan 2015 11:04:07 +0900
Subject: regulator: s2mps11: Fix wrong calculation of register offset

This patch adds missing registers('BUCK7_SW' & 'LDO29_CTRL'). Since BUCK7 has
1 more register (BUCK7_SW) than others, register offset should
be added one more for which has bigger address than BUCK7 registers.

Fixes: 76b9840b24ae04(regulator: s2mps11: Add support S2MPS13 regulator device)
Signed-off-by: Jonghwa Lee <jonghwa3.lee@samsung.com>
Signed-off-by: Chanwoo Choi <cw00.choi@samsung.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
Cc: <stable@vger.kernel.org>
---
 include/linux/mfd/samsung/s2mps13.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mfd/samsung/s2mps13.h b/include/linux/mfd/samsung/s2mps13.h
index ce5dda8958fe..b1fd675fa36f 100644
--- a/include/linux/mfd/samsung/s2mps13.h
+++ b/include/linux/mfd/samsung/s2mps13.h
@@ -59,6 +59,7 @@ enum s2mps13_reg {
 	S2MPS13_REG_B6CTRL,
 	S2MPS13_REG_B6OUT,
 	S2MPS13_REG_B7CTRL,
+	S2MPS13_REG_B7SW,
 	S2MPS13_REG_B7OUT,
 	S2MPS13_REG_B8CTRL,
 	S2MPS13_REG_B8OUT,
@@ -102,6 +103,7 @@ enum s2mps13_reg {
 	S2MPS13_REG_L26CTRL,
 	S2MPS13_REG_L27CTRL,
 	S2MPS13_REG_L28CTRL,
+	S2MPS13_REG_L29CTRL,
 	S2MPS13_REG_L30CTRL,
 	S2MPS13_REG_L31CTRL,
 	S2MPS13_REG_L32CTRL,
-- 
cgit v1.2.3


From f331a859e0ee5a898c1f47596eddad4c4f02d657 Mon Sep 17 00:00:00 2001
From: Alex Williamson <alex.williamson@redhat.com>
Date: Thu, 15 Jan 2015 18:16:04 -0600
Subject: PCI: Add flag for devices where we can't use bus reset

Enable a mechanism for devices to quirk that they do not behave when
doing a PCI bus reset.  We require a modest level of spec compliant
behavior in order to do a reset, for instance the device should come
out of reset without throwing errors and PCI config space should be
accessible after reset.  This is too much to ask for some devices.

Link: http://lkml.kernel.org/r/20140923210318.498dacbd@dualc.maya.org
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
CC: stable@vger.kernel.org	# v3.14+
---
 include/linux/pci.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index 360a966a97a5..44627f1df4ca 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -175,6 +175,8 @@ enum pci_dev_flags {
 	PCI_DEV_FLAGS_DMA_ALIAS_DEVFN = (__force pci_dev_flags_t) (1 << 4),
 	/* Use a PCIe-to-PCI bridge alias even if !pci_is_pcie */
 	PCI_DEV_FLAG_PCIE_BRIDGE_ALIAS = (__force pci_dev_flags_t) (1 << 5),
+	/* Do not use bus resets for device */
+	PCI_DEV_FLAGS_NO_BUS_RESET = (__force pci_dev_flags_t) (1 << 6),
 };
 
 enum pci_irq_reroute_variant {
-- 
cgit v1.2.3


From 8505e729a2f6eb0803ff943a15f133dd10afff3a Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Thu, 15 Jan 2015 16:21:49 -0600
Subject: PCI: Add pci_claim_bridge_resource() to clip window if necessary

Add pci_claim_bridge_resource() to claim a PCI-PCI bridge window.  This is
like regular pci_claim_resource(), except that if we fail to claim the
window, we check to see if we can reduce the size of the window and try
again.

This is for scenarios like this:

  pci_bus 0000:00: root bus resource [mem 0xc0000000-0xffffffff]
  pci 0000:00:01.0:   bridge window [mem 0xbdf00000-0xddefffff 64bit pref]
  pci 0000:01:00.0: reg 0x10: [mem 0xc0000000-0xcfffffff pref]

The 00:01.0 window is illegal: it starts before the host bridge window, so
we have to assume the [0xbdf00000-0xbfffffff] region is inaccessible.  We
can make it legal by clipping it to [mem 0xc0000000-0xddefffff 64bit pref].

Previously we discarded the 00:01.0 window and tried to reassign that part
of the hierarchy from scratch.  That is a problem because Linux doesn't
always assign things optimally.  For example, in this case, BIOS put the
01:00.0 device in a prefetchable window below 4GB, but after 5b28541552ef,
Linux puts the prefetchable window above 4GB where the 32-bit 01:00.0
device can't use it.

Clipping the 00:01.0 window is less intrusive than completely reassigning
things and is sufficient to let us use most of the BIOS configuration.  Of
course, it's possible that devices below 00:01.0 will no longer fit.  If
that's the case, we'll have to reassign things.  But that's a separate
problem.

[bhelgaas: changelog, split into separate patch]
Link: https://bugzilla.kernel.org/show_bug.cgi?id=85491
Reported-by: Marek Kordik <kordikmarek@gmail.com>
Fixes: 5b28541552ef ("PCI: Restrict 64-bit prefetchable bridge windows to 64-bit resources")
Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
CC: stable@vger.kernel.org	# v3.16+
---
 include/linux/pci.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index 44627f1df4ca..9603094ed59b 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1067,6 +1067,7 @@ resource_size_t pcibios_retrieve_fw_addr(struct pci_dev *dev, int idx);
 void pci_bus_assign_resources(const struct pci_bus *bus);
 void pci_bus_size_bridges(struct pci_bus *bus);
 int pci_claim_resource(struct pci_dev *, int);
+int pci_claim_bridge_resource(struct pci_dev *bridge, int i);
 void pci_assign_unassigned_resources(void);
 void pci_assign_unassigned_bridge_resources(struct pci_dev *bridge);
 void pci_assign_unassigned_bus_resources(struct pci_bus *bus);
-- 
cgit v1.2.3


From ee1c244219fd652964710a6cc3e4f922e86aa492 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Fri, 16 Jan 2015 11:37:14 +0100
Subject: genetlink: synchronize socket closing and family removal

In addition to the problem Jeff Layton reported, I looked at the code
and reproduced the same warning by subscribing and removing the genl
family with a socket still open. This is a fairly tricky race which
originates in the fact that generic netlink allows the family to go
away while sockets are still open - unlike regular netlink which has
a module refcount for every open socket so in general this cannot be
triggered.

Trying to resolve this issue by the obvious locking isn't possible as
it will result in deadlocks between unregistration and group unbind
notification (which incidentally lockdep doesn't find due to the home
grown locking in the netlink table.)

To really resolve this, introduce a "closing socket" reference counter
(for generic netlink only, as it's the only affected family) in the
core netlink code and use that in generic netlink to wait for all the
sockets that are being closed at the same time as a generic netlink
family is removed.

This fixes the race that when a socket is closed, it will should call
the unbind, but if the family is removed at the same time the unbind
will not find it, leading to the warning. The real problem though is
that in this case the unbind could actually find a new family that is
registered to have a multicast group with the same ID, and call its
mcast_unbind() leading to confusing.

Also remove the warning since it would still trigger, but is now no
longer a problem.

This also moves the code in af_netlink.c to before unreferencing the
module to avoid having the same problem in the normal non-genl case.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/genetlink.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/genetlink.h b/include/linux/genetlink.h
index 55b685719d52..09460d6d6682 100644
--- a/include/linux/genetlink.h
+++ b/include/linux/genetlink.h
@@ -11,6 +11,10 @@ extern void genl_unlock(void);
 extern int lockdep_genl_is_held(void);
 #endif
 
+/* for synchronisation between af_netlink and genetlink */
+extern atomic_t genl_sk_destructing_cnt;
+extern wait_queue_head_t genl_sk_destructing_waitq;
+
 /**
  * rcu_dereference_genl - rcu_dereference with debug checking
  * @p: The pointer to read, prior to dereferencing
-- 
cgit v1.2.3


From 72dd299d5039a336493993dcc63413cf31d0e662 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 16 Jan 2015 15:13:02 -0800
Subject: libata: allow sata_sil24 to opt-out of tag ordered submission

Ronny reports: https://bugzilla.kernel.org/show_bug.cgi?id=87101
    "Since commit 8a4aeec8d "libata/ahci: accommodate tag ordered
    controllers" the access to the harddisk on the first SATA-port is
    failing on its first access. The access to the harddisk on the
    second port is working normal.

    When reverting the above commit, access to both harddisks is working
    fine again."

Maintain tag ordered submission as the default, but allow sata_sil24 to
continue with the old behavior.

Cc: <stable@vger.kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Reported-by: Ronny Hegewald <Ronny.Hegewald@online.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/libata.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/libata.h b/include/linux/libata.h
index f2b440e44fd7..91f705de2c0b 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -231,6 +231,7 @@ enum {
 	ATA_FLAG_SW_ACTIVITY	= (1 << 22), /* driver supports sw activity
 					      * led */
 	ATA_FLAG_NO_DIPM	= (1 << 23), /* host not happy with DIPM */
+	ATA_FLAG_LOWTAG		= (1 << 24), /* host wants lowest available tag */
 
 	/* bits 24:31 of ap->flags are reserved for LLD specific flags */
 
-- 
cgit v1.2.3


From d453cded05ee219b77815ea194dc36efa5398bca Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 20 Jan 2015 09:07:04 +1030
Subject: module_arch_freeing_init(): new hook for archs before
 module->module_init freed.

Archs have been abusing module_free() to clean up their arch-specific
allocations.  Since module_free() is also (ab)used by BPF and trace code,
let's keep it to simple allocations, and provide a hook called before
that.

This means that avr32, ia64, parisc and s390 no longer need to implement
their own module_free() at all.  avr32 doesn't need module_finalize()
either.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: Chris Metcalf <cmetcalf@ezchip.com>
Cc: Haavard Skinnemoen <hskinnemoen@gmail.com>
Cc: Hans-Christian Egtvedt <egtvedt@samfundet.no>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: "James E.J. Bottomley" <jejb@parisc-linux.org>
Cc: Helge Deller <deller@gmx.de>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: linux-kernel@vger.kernel.org
Cc: linux-ia64@vger.kernel.org
Cc: linux-parisc@vger.kernel.org
Cc: linux-s390@vger.kernel.org
---
 include/linux/moduleloader.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/moduleloader.h b/include/linux/moduleloader.h
index 7eeb9bbfb816..054eac853090 100644
--- a/include/linux/moduleloader.h
+++ b/include/linux/moduleloader.h
@@ -82,4 +82,6 @@ int module_finalize(const Elf_Ehdr *hdr,
 /* Any cleanup needed when module leaves. */
 void module_arch_cleanup(struct module *mod);
 
+/* Any cleanup before freeing mod->module_init */
+void module_arch_freeing_init(struct module *mod);
 #endif
-- 
cgit v1.2.3


From be1f221c0445a4157d177197c236f888d3581914 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 20 Jan 2015 09:07:05 +1030
Subject: module: remove mod arg from module_free, rename module_memfree().

Nothing needs the module pointer any more, and the next patch will
call it from RCU, where the module itself might no longer exist.
Removing the arg is the safest approach.

This just codifies the use of the module_alloc/module_free pattern
which ftrace and bpf use.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Cc: Mikael Starvik <starvik@axis.com>
Cc: Jesper Nilsson <jesper.nilsson@axis.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Ley Foon Tan <lftan@altera.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Chris Metcalf <cmetcalf@ezchip.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: x86@kernel.org
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Cc: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: linux-cris-kernel@axis.com
Cc: linux-kernel@vger.kernel.org
Cc: linux-mips@linux-mips.org
Cc: nios2-dev@lists.rocketboards.org
Cc: linuxppc-dev@lists.ozlabs.org
Cc: sparclinux@vger.kernel.org
Cc: netdev@vger.kernel.org
---
 include/linux/moduleloader.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/moduleloader.h b/include/linux/moduleloader.h
index 054eac853090..f7556261fe3c 100644
--- a/include/linux/moduleloader.h
+++ b/include/linux/moduleloader.h
@@ -26,7 +26,7 @@ unsigned int arch_mod_section_prepend(struct module *mod, unsigned int section);
 void *module_alloc(unsigned long size);
 
 /* Free memory returned from module_alloc. */
-void module_free(struct module *mod, void *module_region);
+void module_memfree(void *module_region);
 
 /*
  * Apply the given relocation to the (simplified) ELF.  Return -error
-- 
cgit v1.2.3


From d5db139ab3764640e0882a1746e7b9fdee33fd87 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Thu, 22 Jan 2015 11:13:14 +1030
Subject: module: make module_refcount() a signed integer.

James Bottomley points out that it will be -1 during unload.  It's
only used for diagnostics, so let's not hide that as it could be a
clue as to what's gone wrong.

Cc: Jason Wessel <jason.wessel@windriver.com>
Acked-and-documention-added-by: James Bottomley <James.Bottomley@HansenPartnership.com>
Reviewed-by: Masami Hiramatsu <maasami.hiramatsu.pt@hitachi.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 include/linux/module.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/module.h b/include/linux/module.h
index ebfb0e153c6a..b653d7c0a05a 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -444,7 +444,7 @@ extern void __module_put_and_exit(struct module *mod, long code)
 #define module_put_and_exit(code) __module_put_and_exit(THIS_MODULE, code)
 
 #ifdef CONFIG_MODULE_UNLOAD
-unsigned long module_refcount(struct module *mod);
+int module_refcount(struct module *mod);
 void __symbol_put(const char *symbol);
 #define symbol_put(x) __symbol_put(VMLINUX_SYMBOL_STR(x))
 void symbol_put_addr(void *addr);
-- 
cgit v1.2.3


From d5fd120e7860c2b3d4c936a2ebadb6b244bec4c8 Mon Sep 17 00:00:00 2001
From: Jean Delvare <jdelvare@suse.de>
Date: Mon, 26 Jan 2015 20:59:31 +0100
Subject: i2c: Only include slave support if selected

Make the slave support depend on CONFIG_I2C_SLAVE. Otherwise it gets
included unconditionally, even when it is not needed.

I2C bus drivers which implement slave support must select
I2C_SLAVE.

Signed-off-by: Jean Delvare <jdelvare@suse.de>
Signed-off-by: Wolfram Sang <wsa@the-dreams.de>
---
 include/linux/i2c.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index e3a1721c8354..7c7695940ddd 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -228,7 +228,9 @@ struct i2c_client {
 	struct device dev;		/* the device structure		*/
 	int irq;			/* irq issued by device		*/
 	struct list_head detected;
+#if IS_ENABLED(CONFIG_I2C_SLAVE)
 	i2c_slave_cb_t slave_cb;	/* callback for slave mode	*/
+#endif
 };
 #define to_i2c_client(d) container_of(d, struct i2c_client, dev)
 
@@ -253,6 +255,7 @@ static inline void i2c_set_clientdata(struct i2c_client *dev, void *data)
 
 /* I2C slave support */
 
+#if IS_ENABLED(CONFIG_I2C_SLAVE)
 enum i2c_slave_event {
 	I2C_SLAVE_REQ_READ_START,
 	I2C_SLAVE_REQ_READ_END,
@@ -269,6 +272,7 @@ static inline int i2c_slave_event(struct i2c_client *client,
 {
 	return client->slave_cb(client, event, val);
 }
+#endif
 
 /**
  * struct i2c_board_info - template for device creation
@@ -404,8 +408,10 @@ struct i2c_algorithm {
 	/* To determine what the adapter supports */
 	u32 (*functionality) (struct i2c_adapter *);
 
+#if IS_ENABLED(CONFIG_I2C_SLAVE)
 	int (*reg_slave)(struct i2c_client *client);
 	int (*unreg_slave)(struct i2c_client *client);
+#endif
 };
 
 /**
-- 
cgit v1.2.3


From 9879de7373fcfb466ec198293b6ccc1ad7a42dd8 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Mon, 26 Jan 2015 12:58:32 -0800
Subject: mm: page_alloc: embed OOM killing naturally into allocation slowpath

The OOM killing invocation does a lot of duplicative checks against the
task's allocation context.  Rework it to take advantage of the existing
checks in the allocator slowpath.

The OOM killer is invoked when the allocator is unable to reclaim any
pages but the allocation has to keep looping.  Instead of having a check
for __GFP_NORETRY hidden in oom_gfp_allowed(), just move the OOM
invocation to the true branch of should_alloc_retry().  The __GFP_FS
check from oom_gfp_allowed() can then be moved into the OOM avoidance
branch in __alloc_pages_may_oom(), along with the PF_DUMPCORE test.

__alloc_pages_may_oom() can then signal to the caller whether the OOM
killer was invoked, instead of requiring it to duplicate the order and
high_zoneidx checks to guess this when deciding whether to continue.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/oom.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/oom.h b/include/linux/oom.h
index 853698c721f7..76200984d1e2 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -85,11 +85,6 @@ static inline void oom_killer_enable(void)
 	oom_killer_disabled = false;
 }
 
-static inline bool oom_gfp_allowed(gfp_t gfp_mask)
-{
-	return (gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY);
-}
-
 extern struct task_struct *find_lock_task_mm(struct task_struct *p);
 
 static inline bool task_will_free_mem(struct task_struct *task)
-- 
cgit v1.2.3


From 07261edb971492c6b41b44d7b1b51f76807d30ad Mon Sep 17 00:00:00 2001
From: Pranith Kumar <bobby.prani@gmail.com>
Date: Mon, 26 Jan 2015 12:58:43 -0800
Subject: printk: add dummy routine for when CONFIG_PRINTK=n

There are missing dummy routines for log_buf_addr_get() and
log_buf_len_get() for when CONFIG_PRINTK is not set causing build
failures.

This patch adds these dummy routines at the appropriate location.

Signed-off-by: Pranith Kumar <bobby.prani@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Reviewed-by: Petr Mladek <pmladek@suse.cz>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/printk.h | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/printk.h b/include/linux/printk.h
index c8f170324e64..4d5bf5726578 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -10,9 +10,6 @@
 extern const char linux_banner[];
 extern const char linux_proc_banner[];
 
-extern char *log_buf_addr_get(void);
-extern u32 log_buf_len_get(void);
-
 static inline int printk_get_level(const char *buffer)
 {
 	if (buffer[0] == KERN_SOH_ASCII && buffer[1]) {
@@ -163,6 +160,8 @@ extern int kptr_restrict;
 
 extern void wake_up_klogd(void);
 
+char *log_buf_addr_get(void);
+u32 log_buf_len_get(void);
 void log_buf_kexec_setup(void);
 void __init setup_log_buf(int early);
 void dump_stack_set_arch_desc(const char *fmt, ...);
@@ -198,6 +197,16 @@ static inline void wake_up_klogd(void)
 {
 }
 
+static inline char *log_buf_addr_get(void)
+{
+	return NULL;
+}
+
+static inline u32 log_buf_len_get(void)
+{
+	return 0;
+}
+
 static inline void log_buf_kexec_setup(void)
 {
 }
-- 
cgit v1.2.3


From 14bf61ffe6ac54afcd1e888a4407fe16054483db Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 9 Oct 2014 16:03:13 +0200
Subject: quota: Switch ->get_dqblk() and ->set_dqblk() to use bytes as space
 units

Currently ->get_dqblk() and ->set_dqblk() use struct fs_disk_quota which
tracks space limits and usage in 512-byte blocks. However VFS quotas
track usage in bytes (as some filesystems require that) and we need to
somehow pass this information. Upto now it wasn't a problem because we
didn't do any unit conversion (thus VFS quota routines happily stuck
number of bytes into d_bcount field of struct fd_disk_quota). Only if
you tried to use Q_XGETQUOTA or Q_XSETQLIM for VFS quotas (or Q_GETQUOTA
/ Q_SETQUOTA for XFS quotas), you got bogus results. Hardly anyone
tried this but reportedly some Samba users hit the problem in practice.
So when we want interfaces compatible we need to fix this.

We bite the bullet and define another quota structure used for passing
information from/to ->get_dqblk()/->set_dqblk. It's somewhat sad we have
to have more conversion routines in fs/quota/quota.c and another copying
of quota structure slows down getting of quota information by about 2%
but it seems cleaner than overloading e.g. units of d_bcount to bytes.

CC: stable@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 include/linux/quota.h    | 47 +++++++++++++++++++++++++++++++++++++++++++++--
 include/linux/quotaops.h |  4 ++--
 2 files changed, 47 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/quota.h b/include/linux/quota.h
index 50978b781a19..097d7eb2441e 100644
--- a/include/linux/quota.h
+++ b/include/linux/quota.h
@@ -321,6 +321,49 @@ struct dquot_operations {
 
 struct path;
 
+/* Structure for communicating via ->get_dqblk() & ->set_dqblk() */
+struct qc_dqblk {
+	int d_fieldmask;	/* mask of fields to change in ->set_dqblk() */
+	u64 d_spc_hardlimit;	/* absolute limit on used space */
+	u64 d_spc_softlimit;	/* preferred limit on used space */
+	u64 d_ino_hardlimit;	/* maximum # allocated inodes */
+	u64 d_ino_softlimit;	/* preferred inode limit */
+	u64 d_space;		/* Space owned by the user */
+	u64 d_ino_count;	/* # inodes owned by the user */
+	s64 d_ino_timer;	/* zero if within inode limits */
+				/* if not, we refuse service */
+	s64 d_spc_timer;	/* similar to above; for space */
+	int d_ino_warns;	/* # warnings issued wrt num inodes */
+	int d_spc_warns;	/* # warnings issued wrt used space */
+	u64 d_rt_spc_hardlimit;	/* absolute limit on realtime space */
+	u64 d_rt_spc_softlimit;	/* preferred limit on RT space */
+	u64 d_rt_space;		/* realtime space owned */
+	s64 d_rt_spc_timer;	/* similar to above; for RT space */
+	int d_rt_spc_warns;	/* # warnings issued wrt RT space */
+};
+
+/* Field specifiers for ->set_dqblk() in struct qc_dqblk */
+#define	QC_INO_SOFT	(1<<0)
+#define	QC_INO_HARD	(1<<1)
+#define	QC_SPC_SOFT	(1<<2)
+#define	QC_SPC_HARD	(1<<3)
+#define	QC_RT_SPC_SOFT	(1<<4)
+#define	QC_RT_SPC_HARD	(1<<5)
+#define QC_LIMIT_MASK (QC_INO_SOFT | QC_INO_HARD | QC_SPC_SOFT | QC_SPC_HARD | \
+		       QC_RT_SPC_SOFT | QC_RT_SPC_HARD)
+#define	QC_SPC_TIMER	(1<<6)
+#define	QC_INO_TIMER	(1<<7)
+#define	QC_RT_SPC_TIMER	(1<<8)
+#define QC_TIMER_MASK (QC_SPC_TIMER | QC_INO_TIMER | QC_RT_SPC_TIMER)
+#define	QC_SPC_WARNS	(1<<9)
+#define	QC_INO_WARNS	(1<<10)
+#define	QC_RT_SPC_WARNS	(1<<11)
+#define QC_WARNS_MASK (QC_SPC_WARNS | QC_INO_WARNS | QC_RT_SPC_WARNS)
+#define	QC_SPACE	(1<<12)
+#define	QC_INO_COUNT	(1<<13)
+#define	QC_RT_SPACE	(1<<14)
+#define QC_ACCT_MASK (QC_SPACE | QC_INO_COUNT | QC_RT_SPACE)
+
 /* Operations handling requests from userspace */
 struct quotactl_ops {
 	int (*quota_on)(struct super_block *, int, int, struct path *);
@@ -329,8 +372,8 @@ struct quotactl_ops {
 	int (*quota_sync)(struct super_block *, int);
 	int (*get_info)(struct super_block *, int, struct if_dqinfo *);
 	int (*set_info)(struct super_block *, int, struct if_dqinfo *);
-	int (*get_dqblk)(struct super_block *, struct kqid, struct fs_disk_quota *);
-	int (*set_dqblk)(struct super_block *, struct kqid, struct fs_disk_quota *);
+	int (*get_dqblk)(struct super_block *, struct kqid, struct qc_dqblk *);
+	int (*set_dqblk)(struct super_block *, struct kqid, struct qc_dqblk *);
 	int (*get_xstate)(struct super_block *, struct fs_quota_stat *);
 	int (*set_xstate)(struct super_block *, unsigned int, int);
 	int (*get_xstatev)(struct super_block *, struct fs_quota_statv *);
diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h
index f23538a6e411..29e3455f7d41 100644
--- a/include/linux/quotaops.h
+++ b/include/linux/quotaops.h
@@ -98,9 +98,9 @@ int dquot_quota_sync(struct super_block *sb, int type);
 int dquot_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii);
 int dquot_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii);
 int dquot_get_dqblk(struct super_block *sb, struct kqid id,
-		struct fs_disk_quota *di);
+		struct qc_dqblk *di);
 int dquot_set_dqblk(struct super_block *sb, struct kqid id,
-		struct fs_disk_quota *di);
+		struct qc_dqblk *di);
 
 int __dquot_transfer(struct inode *inode, struct dquot **transfer_to);
 int dquot_transfer(struct inode *inode, struct iattr *iattr);
-- 
cgit v1.2.3


From c3c87e770458aa004bd7ed3f29945ff436fd6511 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 23 Jan 2015 11:19:48 +0100
Subject: perf: Tighten (and fix) the grouping condition

The fix from 9fc81d87420d ("perf: Fix events installation during
moving group") was incomplete in that it failed to recognise that
creating a group with events for different CPUs is semantically
broken -- they cannot be co-scheduled.

Furthermore, it leads to real breakage where, when we create an event
for CPU Y and then migrate it to form a group on CPU X, the code gets
confused where the counter is programmed -- triggered in practice
as well by me via the perf fuzzer.

Fix this by tightening the rules for creating groups. Only allow
grouping of counters that can be co-scheduled in the same context.
This means for the same task and/or the same cpu.

Fixes: 9fc81d87420d ("perf: Fix events installation during moving group")
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/20150123125834.090683288@infradead.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/perf_event.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 4f7a61ca4b39..664de5a4ec46 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -450,11 +450,6 @@ struct perf_event {
 #endif /* CONFIG_PERF_EVENTS */
 };
 
-enum perf_event_context_type {
-	task_context,
-	cpu_context,
-};
-
 /**
  * struct perf_event_context - event context structure
  *
@@ -462,7 +457,6 @@ enum perf_event_context_type {
  */
 struct perf_event_context {
 	struct pmu			*pmu;
-	enum perf_event_context_type	type;
 	/*
 	 * Protect the states of the events in the list,
 	 * nr_active, and the list:
-- 
cgit v1.2.3


From 33692f27597fcab536d7cbbcc8f52905133e4aa7 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Thu, 29 Jan 2015 10:51:32 -0800
Subject: vm: add VM_FAULT_SIGSEGV handling support

The core VM already knows about VM_FAULT_SIGBUS, but cannot return a
"you should SIGSEGV" error, because the SIGSEGV case was generally
handled by the caller - usually the architecture fault handler.

That results in lots of duplication - all the architecture fault
handlers end up doing very similar "look up vma, check permissions, do
retries etc" - but it generally works.  However, there are cases where
the VM actually wants to SIGSEGV, and applications _expect_ SIGSEGV.

In particular, when accessing the stack guard page, libsigsegv expects a
SIGSEGV.  And it usually got one, because the stack growth is handled by
that duplicated architecture fault handler.

However, when the generic VM layer started propagating the error return
from the stack expansion in commit fee7e49d4514 ("mm: propagate error
from stack expansion even for guard page"), that now exposed the
existing VM_FAULT_SIGBUS result to user space.  And user space really
expected SIGSEGV, not SIGBUS.

To fix that case, we need to add a VM_FAULT_SIGSEGV, and teach all those
duplicate architecture fault handlers about it.  They all already have
the code to handle SIGSEGV, so it's about just tying that new return
value to the existing code, but it's all a bit annoying.

This is the mindless minimal patch to do this.  A more extensive patch
would be to try to gather up the mostly shared fault handling logic into
one generic helper routine, and long-term we really should do that
cleanup.

Just from this patch, you can generally see that most architectures just
copied (directly or indirectly) the old x86 way of doing things, but in
the meantime that original x86 model has been improved to hold the VM
semaphore for shorter times etc and to handle VM_FAULT_RETRY and other
"newer" things, so it would be a good idea to bring all those
improvements to the generic case and teach other architectures about
them too.

Reported-and-tested-by: Takashi Iwai <tiwai@suse.de>
Tested-by: Jan Engelhardt <jengelh@inai.de>
Acked-by: Heiko Carstens <heiko.carstens@de.ibm.com> # "s390 still compiles and boots"
Cc: linux-arch@vger.kernel.org
Cc: stable@vger.kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 80fc92a49649..dd5ea3016fc4 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1070,6 +1070,7 @@ static inline int page_mapped(struct page *page)
 #define VM_FAULT_WRITE	0x0008	/* Special case for get_user_pages */
 #define VM_FAULT_HWPOISON 0x0010	/* Hit poisoned small page */
 #define VM_FAULT_HWPOISON_LARGE 0x0020  /* Hit poisoned large page. Index encoded in upper bits */
+#define VM_FAULT_SIGSEGV 0x0040
 
 #define VM_FAULT_NOPAGE	0x0100	/* ->fault installed the pte, not return page */
 #define VM_FAULT_LOCKED	0x0200	/* ->fault locked the returned page */
@@ -1078,8 +1079,9 @@ static inline int page_mapped(struct page *page)
 
 #define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */
 
-#define VM_FAULT_ERROR	(VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_HWPOISON | \
-			 VM_FAULT_FALLBACK | VM_FAULT_HWPOISON_LARGE)
+#define VM_FAULT_ERROR	(VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | \
+			 VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE | \
+			 VM_FAULT_FALLBACK)
 
 /* Encode hstate index for a hwpoisoned large page */
 #define VM_FAULT_SET_HINDEX(x) ((x) << 12)
-- 
cgit v1.2.3


From 00845eb968ead28007338b2bb852b8beef816583 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sun, 1 Feb 2015 12:23:32 -0800
Subject: sched: don't cause task state changes in nested sleep debugging
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Commit 8eb23b9f35aa ("sched: Debug nested sleeps") added code to report
on nested sleep conditions, which we generally want to avoid because the
inner sleeping operation can re-set the thread state to TASK_RUNNING,
but that will then cause the outer sleep loop not actually sleep when it
calls schedule.

However, that's actually valid traditional behavior, with the inner
sleep being some fairly rare case (like taking a sleeping lock that
normally doesn't actually need to sleep).

And the debug code would actually change the state of the task to
TASK_RUNNING internally, which makes that kind of traditional and
working code not work at all, because now the nested sleep doesn't just
sometimes cause the outer one to not block, but will cause it to happen
every time.

In particular, it will cause the cardbus kernel daemon (pccardd) to
basically busy-loop doing scheduling, converting a laptop into a heater,
as reported by Bruno Prémont.  But there may be other legacy uses of
that nested sleep model in other drivers that are also likely to never
get converted to the new model.

This fixes both cases:

 - don't set TASK_RUNNING when the nested condition happens (note: even
   if WARN_ONCE() only _warns_ once, the return value isn't whether the
   warning happened, but whether the condition for the warning was true.
   So despite the warning only happening once, the "if (WARN_ON(..))"
   would trigger for every nested sleep.

 - in the cases where we knowingly disable the warning by using
   "sched_annotate_sleep()", don't change the task state (that is used
   for all core scheduling decisions), instead use '->task_state_change'
   that is used for the debugging decision itself.

(Credit for the second part of the fix goes to Oleg Nesterov: "Can't we
avoid this subtle change in behaviour DEBUG_ATOMIC_SLEEP adds?" with the
suggested change to use 'task_state_change' as part of the test)

Reported-and-bisected-by: Bruno Prémont <bonbons@linux-vserver.org>
Tested-by: Rafael J Wysocki <rjw@rjwysocki.net>
Acked-by: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>,
Cc: Ilya Dryomov <ilya.dryomov@inktank.com>,
Cc: Mike Galbraith <umgwanakikbuti@gmail.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Peter Hurley <peter@hurleysoftware.com>,
Cc: Davidlohr Bueso <dave@stgolabs.net>,
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kernel.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 5449d2f4a1ef..64ce58bee6f5 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -176,7 +176,7 @@ extern int _cond_resched(void);
  */
 # define might_sleep() \
 	do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0)
-# define sched_annotate_sleep()	__set_current_state(TASK_RUNNING)
+# define sched_annotate_sleep()	(current->task_state_change = 0)
 #else
   static inline void ___might_sleep(const char *file, int line,
 				   int preempt_offset) { }
-- 
cgit v1.2.3