From 86d56134f1b67d0c18025ba5cade95c048ed528d Mon Sep 17 00:00:00 2001 From: Michael Marineau Date: Thu, 10 Apr 2014 14:09:31 -0700 Subject: kobject: Make support for uevent_helper optional. Support for uevent_helper, aka hotplug, is not required on many systems these days but it can still be enabled via sysfs or sysctl. Reported-by: Darren Shepherd Signed-off-by: Michael Marineau Signed-off-by: Greg Kroah-Hartman --- kernel/sysctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 74f5b580fe34..bc966a8ffc3e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -643,7 +643,7 @@ static struct ctl_table kern_table[] = { .extra2 = &one, }, #endif - +#ifdef CONFIG_UEVENT_HELPER { .procname = "hotplug", .data = &uevent_helper, @@ -651,7 +651,7 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dostring, }, - +#endif #ifdef CONFIG_CHR_DEV_SG { .procname = "sg-big-buff", -- cgit v1.2.3 From 3d7ee969bffcc984c8aeaffc6ac6816fd929ace1 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 May 2014 12:19:32 -0700 Subject: x86, vdso: Clean up 32-bit vs 64-bit vdso params Rather than using 'vdso_enabled' and an awful #define, just call the parameters vdso32_enabled and vdso64_enabled. Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/87913de56bdcbae3d93917938302fc369b05caee.1399317206.git.luto@amacapital.net Signed-off-by: H. Peter Anvin --- kernel/sysctl.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel/sysctl.c') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 74f5b580fe34..420d77afa8fd 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1418,8 +1418,13 @@ static struct ctl_table vm_table[] = { (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) { .procname = "vdso_enabled", +#ifdef CONFIG_X86_32 + .data = &vdso32_enabled, + .maxlen = sizeof(vdso32_enabled), +#else .data = &vdso_enabled, .maxlen = sizeof(vdso_enabled), +#endif .mode = 0644, .proc_handler = proc_dointvec, .extra1 = &zero, -- cgit v1.2.3 From 122ff243f5f104194750ecbc76d5946dd1eec934 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Mon, 12 May 2014 16:04:53 -0700 Subject: ipv4: make ip_local_reserved_ports per netns ip_local_port_range is already per netns, so should ip_local_reserved_ports be. And since it is none by default we don't actually need it when we don't enable CONFIG_SYSCTL. By the way, rename inet_is_reserved_local_port() to inet_is_local_reserved_port() Cc: "David S. Miller" Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- kernel/sysctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 74f5b580fe34..e36ae4b15726 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2501,11 +2501,11 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, bool first = 1; size_t left = *lenp; unsigned long bitmap_len = table->maxlen; - unsigned long *bitmap = (unsigned long *) table->data; + unsigned long *bitmap = *(unsigned long **) table->data; unsigned long *tmp_bitmap = NULL; char tr_a[] = { '-', ',', '\n' }, tr_b[] = { ',', '\n', 0 }, c; - if (!bitmap_len || !left || (*ppos && !write)) { + if (!bitmap || !bitmap_len || !left || (*ppos && !write)) { *lenp = 0; return 0; } -- cgit v1.2.3 From 8c7260c0d9a6fa327546af724fb48375df493326 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Fri, 16 May 2014 23:26:02 +0200 Subject: sparc64: fix sparse warning in tsb.c Fix following warning: tsb.c:290:5: warning: symbol 'sysctl_tsb_ratio' was not declared. Should it be static? Add extern declaration in asm/setup.h and remove local declaration in kernel/sysctl.c Signed-off-by: Sam Ravnborg Signed-off-by: David S. Miller --- kernel/sysctl.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 74f5b580fe34..3c202d00f6e8 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -152,10 +152,6 @@ static unsigned long hung_task_timeout_max = (LONG_MAX/HZ); #ifdef CONFIG_SPARC #endif -#ifdef CONFIG_SPARC64 -extern int sysctl_tsb_ratio; -#endif - #ifdef __hppa__ extern int pwrsw_enabled; #endif -- cgit v1.2.3 From f88083005ab319abba5d0b2e4e997558245493c8 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 6 Jun 2014 14:37:17 -0700 Subject: sysctl: clean up char buffer arguments When writing to a sysctl string, each write, regardless of VFS position, began writing the string from the start. This meant the contents of the last write to the sysctl controlled the string contents instead of the first. This misbehavior was featured in an exploit against Chrome OS. While it's not in itself a vulnerability, it's a weirdness that isn't on the mind of most auditors: "This filter looks correct, the first line written would not be meaningful to sysctl" doesn't apply here, since the size of the write and the contents of the final write are what matter when writing to sysctls. This adds the sysctl kernel.sysctl_writes_strict to control the write behavior. The default (0) reports when VFS position is non-0 on a write, but retains legacy behavior, -1 disables the warning, and 1 enables the position-respecting behavior. The long-term plan here is to wait for userspace to be fixed in response to the new warning and to then switch the default kernel behavior to the new position-respecting behavior. This patch (of 4): The char buffer arguments are needlessly cast in weird places. Clean it up so things are easier to read. Signed-off-by: Kees Cook Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 40ce2d983b12..3e214beabbe9 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1703,8 +1703,8 @@ int __init sysctl_init(void) #ifdef CONFIG_PROC_SYSCTL -static int _proc_do_string(void* data, int maxlen, int write, - void __user *buffer, +static int _proc_do_string(char *data, int maxlen, int write, + char __user *buffer, size_t *lenp, loff_t *ppos) { size_t len; @@ -1730,7 +1730,7 @@ static int _proc_do_string(void* data, int maxlen, int write, len = maxlen-1; if(copy_from_user(data, buffer, len)) return -EFAULT; - ((char *) data)[len] = 0; + data[len] = 0; *ppos += *lenp; } else { len = strlen(data); @@ -1748,10 +1748,10 @@ static int _proc_do_string(void* data, int maxlen, int write, if (len > *lenp) len = *lenp; if (len) - if(copy_to_user(buffer, data, len)) + if (copy_to_user(buffer, data, len)) return -EFAULT; if (len < *lenp) { - if(put_user('\n', ((char __user *) buffer) + len)) + if (put_user('\n', buffer + len)) return -EFAULT; len++; } @@ -1781,8 +1781,8 @@ static int _proc_do_string(void* data, int maxlen, int write, int proc_dostring(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - return _proc_do_string(table->data, table->maxlen, write, - buffer, lenp, ppos); + return _proc_do_string((char *)(table->data), table->maxlen, write, + (char __user *)buffer, lenp, ppos); } static size_t proc_skip_spaces(char **buf) -- cgit v1.2.3 From 2ca9bb456ada8bcbdc8f77f8fc78207653bbaa92 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 6 Jun 2014 14:37:18 -0700 Subject: sysctl: refactor sysctl string writing logic Consolidate buffer length checking with new-line/end-of-line checking. Additionally, instead of reading user memory twice, just do the assignment during the loop. This change doesn't affect the potential races here. It was already possible to read a sysctl that was in the middle of a write. In both cases, the string will always be NULL terminated. The pre-existing race remains a problem to be solved. Signed-off-by: Kees Cook Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 3e214beabbe9..ac6847feaa83 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1717,21 +1717,18 @@ static int _proc_do_string(char *data, int maxlen, int write, } if (write) { + /* Start writing from beginning of buffer. */ len = 0; + *ppos += *lenp; p = buffer; - while (len < *lenp) { + while ((p - buffer) < *lenp && len < maxlen - 1) { if (get_user(c, p++)) return -EFAULT; if (c == 0 || c == '\n') break; - len++; + data[len++] = c; } - if (len >= maxlen) - len = maxlen-1; - if(copy_from_user(data, buffer, len)) - return -EFAULT; data[len] = 0; - *ppos += *lenp; } else { len = strlen(data); if (len > maxlen) -- cgit v1.2.3 From f4aacea2f5d1a5f7e3154e967d70cf3f711bcd61 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 6 Jun 2014 14:37:19 -0700 Subject: sysctl: allow for strict write position handling When writing to a sysctl string, each write, regardless of VFS position, begins writing the string from the start. This means the contents of the last write to the sysctl controls the string contents instead of the first: open("/proc/sys/kernel/modprobe", O_WRONLY) = 1 write(1, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"..., 4096) = 4096 write(1, "/bin/true", 9) = 9 close(1) = 0 $ cat /proc/sys/kernel/modprobe /bin/true Expected behaviour would be to have the sysctl be "AAAA..." capped at maxlen (in this case KMOD_PATH_LEN: 256), instead of truncating to the contents of the second write. Similarly, multiple short writes would not append to the sysctl. The old behavior is unlike regular POSIX files enough that doing audits of software that interact with sysctls can end up in unexpected or dangerous situations. For example, "as long as the input starts with a trusted path" turns out to be an insufficient filter, as what must also happen is for the input to be entirely contained in a single write syscall -- not a common consideration, especially for high level tools. This provides kernel.sysctl_writes_strict as a way to make this behavior act in a less surprising manner for strings, and disallows non-zero file position when writing numeric sysctls (similar to what is already done when reading from non-zero file positions). For now, the default (0) is to warn about non-zero file position use, but retain the legacy behavior. Setting this to -1 disables the warning, and setting this to 1 enables the file position respecting behavior. [akpm@linux-foundation.org: fix build] [akpm@linux-foundation.org: move misplaced hunk, per Randy] Signed-off-by: Kees Cook Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 67 insertions(+), 2 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ac6847feaa83..7a910b9081e8 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -173,6 +173,13 @@ extern int no_unaligned_warning; #endif #ifdef CONFIG_PROC_SYSCTL + +#define SYSCTL_WRITES_LEGACY -1 +#define SYSCTL_WRITES_WARN 0 +#define SYSCTL_WRITES_STRICT 1 + +static int sysctl_writes_strict = SYSCTL_WRITES_WARN; + static int proc_do_cad_pid(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); static int proc_taint(struct ctl_table *table, int write, @@ -495,6 +502,15 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_taint, }, + { + .procname = "sysctl_writes_strict", + .data = &sysctl_writes_strict, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &neg_one, + .extra2 = &one, + }, #endif #ifdef CONFIG_LATENCYTOP { @@ -1717,8 +1733,20 @@ static int _proc_do_string(char *data, int maxlen, int write, } if (write) { - /* Start writing from beginning of buffer. */ - len = 0; + if (sysctl_writes_strict == SYSCTL_WRITES_STRICT) { + /* Only continue writes not past the end of buffer. */ + len = strlen(data); + if (len > maxlen - 1) + len = maxlen - 1; + + if (*ppos > len) + return 0; + len = *ppos; + } else { + /* Start writing from beginning of buffer. */ + len = 0; + } + *ppos += *lenp; p = buffer; while ((p - buffer) < *lenp && len < maxlen - 1) { @@ -1758,6 +1786,14 @@ static int _proc_do_string(char *data, int maxlen, int write, return 0; } +static void warn_sysctl_write(struct ctl_table *table) +{ + pr_warn_once("%s wrote to %s when file position was not 0!\n" + "This will not be supported in the future. To silence this\n" + "warning, set kernel.sysctl_writes_strict = -1\n", + current->comm, table->procname); +} + /** * proc_dostring - read a string sysctl * @table: the sysctl table @@ -1778,6 +1814,9 @@ static int _proc_do_string(char *data, int maxlen, int write, int proc_dostring(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { + if (write && *ppos && sysctl_writes_strict == SYSCTL_WRITES_WARN) + warn_sysctl_write(table); + return _proc_do_string((char *)(table->data), table->maxlen, write, (char __user *)buffer, lenp, ppos); } @@ -1953,6 +1992,18 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, conv = do_proc_dointvec_conv; if (write) { + if (*ppos) { + switch (sysctl_writes_strict) { + case SYSCTL_WRITES_STRICT: + goto out; + case SYSCTL_WRITES_WARN: + warn_sysctl_write(table); + break; + default: + break; + } + } + if (left > PAGE_SIZE - 1) left = PAGE_SIZE - 1; page = __get_free_page(GFP_TEMPORARY); @@ -2010,6 +2061,7 @@ free: return err ? : -EINVAL; } *lenp -= left; +out: *ppos += *lenp; return err; } @@ -2202,6 +2254,18 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int left = *lenp; if (write) { + if (*ppos) { + switch (sysctl_writes_strict) { + case SYSCTL_WRITES_STRICT: + goto out; + case SYSCTL_WRITES_WARN: + warn_sysctl_write(table); + break; + default: + break; + } + } + if (left > PAGE_SIZE - 1) left = PAGE_SIZE - 1; page = __get_free_page(GFP_TEMPORARY); @@ -2257,6 +2321,7 @@ free: return err ? : -EINVAL; } *lenp -= left; +out: *ppos += *lenp; return err; } -- cgit v1.2.3 From 6f8fd1d77e64dd6be1075041cdfd8de4c2ca4e2b Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Fri, 6 Jun 2014 14:38:08 -0700 Subject: sysctl: convert use of typedef ctl_table to struct ctl_table This typedef is unnecessary and should just be removed. Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sysctl.c') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 7a910b9081e8..db19e3e2aa4b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -202,7 +202,7 @@ static int proc_dostring_coredump(struct ctl_table *table, int write, /* Note: sysrq code uses it's own private copy */ static int __sysrq_enabled = CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE; -static int sysrq_sysctl_handler(ctl_table *table, int write, +static int sysrq_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { -- cgit v1.2.3 From 7cd2b0a34ab8e4db971920eef8982f985441adfb Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 23 Jun 2014 13:22:04 -0700 Subject: mm, pcp: allow restoring percpu_pagelist_fraction default Oleg reports a division by zero error on zero-length write() to the percpu_pagelist_fraction sysctl: divide error: 0000 [#1] SMP DEBUG_PAGEALLOC CPU: 1 PID: 9142 Comm: badarea_io Not tainted 3.15.0-rc2-vm-nfs+ #19 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 task: ffff8800d5aeb6e0 ti: ffff8800d87a2000 task.ti: ffff8800d87a2000 RIP: 0010: percpu_pagelist_fraction_sysctl_handler+0x84/0x120 RSP: 0018:ffff8800d87a3e78 EFLAGS: 00010246 RAX: 0000000000000f89 RBX: ffff88011f7fd000 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000001 RDI: 0000000000000010 RBP: ffff8800d87a3e98 R08: ffffffff81d002c8 R09: ffff8800d87a3f50 R10: 000000000000000b R11: 0000000000000246 R12: 0000000000000060 R13: ffffffff81c3c3e0 R14: ffffffff81cfddf8 R15: ffff8801193b0800 FS: 00007f614f1e9740(0000) GS:ffff88011f440000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 00007f614f1fa000 CR3: 00000000d9291000 CR4: 00000000000006e0 Call Trace: proc_sys_call_handler+0xb3/0xc0 proc_sys_write+0x14/0x20 vfs_write+0xba/0x1e0 SyS_write+0x46/0xb0 tracesys+0xe1/0xe6 However, if the percpu_pagelist_fraction sysctl is set by the user, it is also impossible to restore it to the kernel default since the user cannot write 0 to the sysctl. This patch allows the user to write 0 to restore the default behavior. It still requires a fraction equal to or larger than 8, however, as stated by the documentation for sanity. If a value in the range [1, 7] is written, the sysctl will return EINVAL. This successfully solves the divide by zero issue at the same time. Signed-off-by: David Rientjes Reported-by: Oleg Drokin Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 7de6555cfea0..075d1903138f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -136,7 +136,6 @@ static unsigned long dirty_bytes_min = 2 * PAGE_SIZE; /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ static int maxolduid = 65535; static int minolduid; -static int min_percpu_pagelist_fract = 8; static int ngroups_max = NGROUPS_MAX; static const int cap_last_cap = CAP_LAST_CAP; @@ -1317,7 +1316,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(percpu_pagelist_fraction), .mode = 0644, .proc_handler = percpu_pagelist_fraction_sysctl_handler, - .extra1 = &min_percpu_pagelist_fract, + .extra1 = &zero, }, #ifdef CONFIG_MMU { -- cgit v1.2.3 From ed235875e2ca983197831337a986f0517074e1a0 Mon Sep 17 00:00:00 2001 From: Aaron Tomlin Date: Mon, 23 Jun 2014 13:22:05 -0700 Subject: kernel/watchdog.c: print traces for all cpus on lockup detection A 'softlockup' is defined as a bug that causes the kernel to loop in kernel mode for more than a predefined period to time, without giving other tasks a chance to run. Currently, upon detection of this condition by the per-cpu watchdog task, debug information (including a stack trace) is sent to the system log. On some occasions, we have observed that the "victim" rather than the actual "culprit" (i.e. the owner/holder of the contended resource) is reported to the user. Often this information has proven to be insufficient to assist debugging efforts. To avoid loss of useful debug information, for architectures which support NMI, this patch makes it possible to improve soft lockup reporting. This is accomplished by issuing an NMI to each cpu to obtain a stack trace. If NMI is not supported we just revert back to the old method. A sysctl and boot-time parameter is available to toggle this feature. [dzickus@redhat.com: add CONFIG_SMP in certain areas] [akpm@linux-foundation.org: additional CONFIG_SMP=n optimisations] [mq@suse.cz: fix warning] Signed-off-by: Aaron Tomlin Signed-off-by: Don Zickus Cc: David S. Miller Cc: Mateusz Guzik Cc: Oleg Nesterov Signed-off-by: Jan Moskyto Matejka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel/sysctl.c') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 075d1903138f..75b22e22a72c 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -860,6 +860,17 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one, }, +#ifdef CONFIG_SMP + { + .procname = "softlockup_all_cpu_backtrace", + .data = &sysctl_softlockup_all_cpu_backtrace, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, +#endif /* CONFIG_SMP */ { .procname = "nmi_watchdog", .data = &watchdog_user_enabled, -- cgit v1.2.3