From cbee9f88ec1b8dd6b58f25f54e4f52c82ed77690 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 25 Oct 2012 14:16:43 +0200 Subject: mm: numa: Add fault driven placement and migration NOTE: This patch is based on "sched, numa, mm: Add fault driven placement and migration policy" but as it throws away all the policy to just leave a basic foundation I had to drop the signed-offs-by. This patch creates a bare-bones method for setting PTEs pte_numa in the context of the scheduler that when faulted later will be faulted onto the node the CPU is running on. In itself this does nothing useful but any placement policy will fundamentally depend on receiving hints on placement from fault context and doing something intelligent about it. Signed-off-by: Mel Gorman Acked-by: Rik van Riel --- kernel/sysctl.c | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 26f65eaa01f9..025e1ae50ef1 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -256,9 +256,11 @@ static int min_sched_granularity_ns = 100000; /* 100 usecs */ static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ static int min_wakeup_granularity_ns; /* 0 usecs */ static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ +#ifdef CONFIG_SMP static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; -#endif +#endif /* CONFIG_SMP */ +#endif /* CONFIG_SCHED_DEBUG */ #ifdef CONFIG_COMPACTION static int min_extfrag_threshold; @@ -301,6 +303,7 @@ static struct ctl_table kern_table[] = { .extra1 = &min_wakeup_granularity_ns, .extra2 = &max_wakeup_granularity_ns, }, +#ifdef CONFIG_SMP { .procname = "sched_tunable_scaling", .data = &sysctl_sched_tunable_scaling, @@ -347,7 +350,24 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one, }, -#endif +#endif /* CONFIG_SMP */ +#ifdef CONFIG_NUMA_BALANCING + { + .procname = "numa_balancing_scan_period_min_ms", + .data = &sysctl_numa_balancing_scan_period_min, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "numa_balancing_scan_period_max_ms", + .data = &sysctl_numa_balancing_scan_period_max, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif /* CONFIG_NUMA_BALANCING */ +#endif /* CONFIG_SCHED_DEBUG */ { .procname = "sched_rt_period_us", .data = &sysctl_sched_rt_period, -- cgit v1.2.3 From 6e5fb223e89dbe5cb5c563f8d4a4a0a7d62455a8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 25 Oct 2012 14:16:45 +0200 Subject: mm: sched: numa: Implement constant, per task Working Set Sampling (WSS) rate Previously, to probe the working set of a task, we'd use a very simple and crude method: mark all of its address space PROT_NONE. That method has various (obvious) disadvantages: - it samples the working set at dissimilar rates, giving some tasks a sampling quality advantage over others. - creates performance problems for tasks with very large working sets - over-samples processes with large address spaces but which only very rarely execute Improve that method by keeping a rotating offset into the address space that marks the current position of the scan, and advance it by a constant rate (in a CPU cycles execution proportional manner). If the offset reaches the last mapped address of the mm then it then it starts over at the first address. The per-task nature of the working set sampling functionality in this tree allows such constant rate, per task, execution-weight proportional sampling of the working set, with an adaptive sampling interval/frequency that goes from once per 100ms up to just once per 8 seconds. The current sampling volume is 256 MB per interval. As tasks mature and converge their working set, so does the sampling rate slow down to just a trickle, 256 MB per 8 seconds of CPU time executed. This, beyond being adaptive, also rate-limits rarely executing systems and does not over-sample on overloaded systems. [ In AutoNUMA speak, this patch deals with the effective sampling rate of the 'hinting page fault'. AutoNUMA's scanning is currently rate-limited, but it is also fundamentally single-threaded, executing in the knuma_scand kernel thread, so the limit in AutoNUMA is global and does not scale up with the number of CPUs, nor does it scan tasks in an execution proportional manner. So the idea of rate-limiting the scanning was first implemented in the AutoNUMA tree via a global rate limit. This patch goes beyond that by implementing an execution rate proportional working set sampling rate that is not implemented via a single global scanning daemon. ] [ Dan Carpenter pointed out a possible NULL pointer dereference in the first version of this patch. ] Based-on-idea-by: Andrea Arcangeli Bug-Found-By: Dan Carpenter Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Cc: Andrea Arcangeli Cc: Rik van Riel [ Wrote changelog and fixed bug. ] Signed-off-by: Ingo Molnar Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel --- kernel/sysctl.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel/sysctl.c') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 025e1ae50ef1..7d3a2e0475e5 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -366,6 +366,13 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "numa_balancing_scan_size_mb", + .data = &sysctl_numa_balancing_scan_size, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, #endif /* CONFIG_NUMA_BALANCING */ #endif /* CONFIG_SCHED_DEBUG */ { -- cgit v1.2.3 From 4b96a29ba891dd59734cb7be80a900fe93aa2d9f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 25 Oct 2012 14:16:47 +0200 Subject: mm: sched: numa: Implement slow start for working set sampling Add a 1 second delay before starting to scan the working set of a task and starting to balance it amongst nodes. [ note that before the constant per task WSS sampling rate patch the initial scan would happen much later still, in effect that patch caused this regression. ] The theory is that short-run tasks benefit very little from NUMA placement: they come and go, and they better stick to the node they were started on. As tasks mature and rebalance to other CPUs and nodes, so does their NUMA placement have to change and so does it start to matter more and more. In practice this change fixes an observable kbuild regression: # [ a perf stat --null --repeat 10 test of ten bzImage builds to /dev/shm ] !NUMA: 45.291088843 seconds time elapsed ( +- 0.40% ) 45.154231752 seconds time elapsed ( +- 0.36% ) +NUMA, no slow start: 46.172308123 seconds time elapsed ( +- 0.30% ) 46.343168745 seconds time elapsed ( +- 0.25% ) +NUMA, 1 sec slow start: 45.224189155 seconds time elapsed ( +- 0.25% ) 45.160866532 seconds time elapsed ( +- 0.17% ) and it also fixes an observable perf bench (hackbench) regression: # perf stat --null --repeat 10 perf bench sched messaging -NUMA: -NUMA: 0.246225691 seconds time elapsed ( +- 1.31% ) +NUMA no slow start: 0.252620063 seconds time elapsed ( +- 1.13% ) +NUMA 1sec delay: 0.248076230 seconds time elapsed ( +- 1.35% ) The implementation is simple and straightforward, most of the patch deals with adding the /proc/sys/kernel/numa_balancing_scan_delay_ms tunable knob. Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Cc: Andrea Arcangeli Cc: Rik van Riel [ Wrote the changelog, ran measurements, tuned the default. ] Signed-off-by: Ingo Molnar Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel --- kernel/sysctl.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel/sysctl.c') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 7d3a2e0475e5..48a68cc258c1 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -352,6 +352,13 @@ static struct ctl_table kern_table[] = { }, #endif /* CONFIG_SMP */ #ifdef CONFIG_NUMA_BALANCING + { + .procname = "numa_balancing_scan_delay_ms", + .data = &sysctl_numa_balancing_scan_delay, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { .procname = "numa_balancing_scan_period_min_ms", .data = &sysctl_numa_balancing_scan_period_min, -- cgit v1.2.3 From b8593bfda1652755136333cdd362de125b283a9c Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 21 Nov 2012 01:18:23 +0000 Subject: mm: sched: Adapt the scanning rate if a NUMA hinting fault does not migrate The PTE scanning rate and fault rates are two of the biggest sources of system CPU overhead with automatic NUMA placement. Ideally a proper policy would detect if a workload was properly placed, schedule and adjust the PTE scanning rate accordingly. We do not track the necessary information to do that but we at least know if we migrated or not. This patch scans slower if a page was not migrated as the result of a NUMA hinting fault up to sysctl_numa_balancing_scan_period_max which is now higher than the previous default. Once every minute it will reset the scanner in case of phase changes. This is hilariously crude and the numbers are arbitrary. Workloads will converge quite slowly in comparison to what a proper policy should be able to do. On the plus side, we will chew up less CPU for workloads that have no need for automatic balancing. Signed-off-by: Mel Gorman --- kernel/sysctl.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel/sysctl.c') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 48a68cc258c1..8906f90d6fa2 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -366,6 +366,13 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "numa_balancing_scan_period_reset", + .data = &sysctl_numa_balancing_scan_period_reset, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { .procname = "numa_balancing_scan_period_max_ms", .data = &sysctl_numa_balancing_scan_period_max, -- cgit v1.2.3 From b6fca72536fb94098acb23c00b4e65d3bc4bb252 Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Wed, 9 Jan 2013 20:06:28 +0530 Subject: sysctl: Enable IA64 "ignore-unaligned-usertrap" to be used cross-arch IA64 defines /proc/sys/kernel/ignore-unaligned-usertrap to control verbose warnings on unaligned access emulation. Although the exact mechanics of what to do with sysctl (ignore/shout) are arch specific, this change enables the sysctl to be usable cross-arch. Signed-off-by: Vineet Gupta Cc: Fenghua Yu Cc: "Eric W. Biederman" Cc: Serge Hallyn Signed-off-by: Tony Luck --- kernel/sysctl.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c88878db491e..840fd5eb2309 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -161,10 +161,13 @@ extern int unaligned_enabled; #endif #ifdef CONFIG_IA64 -extern int no_unaligned_warning; extern int unaligned_dump_stack; #endif +#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_NO_WARN +extern int no_unaligned_warning; +#endif + #ifdef CONFIG_PROC_SYSCTL static int proc_do_cad_pid(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); @@ -911,7 +914,7 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_doulongvec_minmax, }, #endif -#ifdef CONFIG_IA64 +#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_NO_WARN { .procname = "ignore-unaligned-usertrap", .data = &no_unaligned_warning, @@ -919,6 +922,8 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, +#endif +#ifdef CONFIG_IA64 { .procname = "unaligned-dump-stack", .data = &unaligned_dump_stack, -- cgit v1.2.3 From 373d4d099761cb1f637bed488ab3871945882273 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 21 Jan 2013 17:17:39 +1030 Subject: taint: add explicit flag to show whether lock dep is still OK. Fix up all callers as they were before, with make one change: an unsigned module taints the kernel, but doesn't turn off lockdep. Signed-off-by: Rusty Russell --- kernel/sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sysctl.c') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c88878db491e..f97f9d75cde8 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2006,7 +2006,7 @@ static int proc_taint(struct ctl_table *table, int write, int i; for (i = 0; i < BITS_PER_LONG && tmptaint >> i; i++) { if ((tmptaint >> i) & 1) - add_taint(i); + add_taint(i, LOCKDEP_STILL_OK); } } -- cgit v1.2.3 From cf4aebc292fac7f34f8345664320e9d4a42ca76c Mon Sep 17 00:00:00 2001 From: Clark Williams Date: Thu, 7 Feb 2013 09:46:59 -0600 Subject: sched: Move sched.h sysctl bits into separate header Move the sysctl-related bits from include/linux/sched.h into a new file: include/linux/sched/sysctl.h. Then update source files requiring access to those bits by including the new header file. Signed-off-by: Clark Williams Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20130207094659.06dced96@riff.lan Signed-off-by: Ingo Molnar --- kernel/sysctl.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/sysctl.c') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c88878db491e..7357e23aaf68 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -61,6 +61,7 @@ #include #include #include +#include #include #include -- cgit v1.2.3 From ce0dbbbb30aee6a835511d5be446462388ba9eee Mon Sep 17 00:00:00 2001 From: Clark Williams Date: Thu, 7 Feb 2013 09:47:04 -0600 Subject: sched/rt: Add a tuning knob to allow changing SCHED_RR timeslice Add a /proc/sys/kernel scheduler knob named sched_rr_timeslice_ms that allows global changing of the SCHED_RR timeslice value. User visable value is in milliseconds but is stored as jiffies. Setting to 0 (zero) resets to the default (currently 100ms). Signed-off-by: Clark Williams Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20130207094704.13751796@riff.lan Signed-off-by: Ingo Molnar --- kernel/sysctl.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel/sysctl.c') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 7357e23aaf68..4fc9be955c71 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -404,6 +404,13 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = sched_rt_handler, }, + { + .procname = "sched_rr_timeslice_ms", + .data = &sched_rr_timeslice, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sched_rr_handler, + }, #ifdef CONFIG_SCHED_AUTOGROUP { .procname = "sched_autogroup_enabled", -- cgit v1.2.3 From bf14e3b979a01cd7298d631736f965fe83c6e2bc Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Fri, 18 Jan 2013 15:12:24 +0530 Subject: sysctl: Enable PARISC "unaligned-trap" to be used cross-arch PARISC defines /proc/sys/kernel/unaligned-trap to runtime toggle unaligned access emulation. The exact mechanics of enablig/disabling are still arch specific, we can make the sysctl usable by other arches. Signed-off-by: Vineet Gupta Acked-by: Helge Deller Cc: "James E.J. Bottomley" Cc: Helge Deller Cc: "Eric W. Biederman" Cc: Serge Hallyn --- kernel/sysctl.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel/sysctl.c') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c88878db491e..878b4c41cfb7 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -157,6 +157,9 @@ extern int sysctl_tsb_ratio; #ifdef __hppa__ extern int pwrsw_enabled; +#endif + +#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_ALLOW extern int unaligned_enabled; #endif @@ -545,6 +548,8 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, +#endif +#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_ALLOW { .procname = "unaligned-trap", .data = &unaligned_enabled, -- cgit v1.2.3 From 75f7ad8e043d9383337d917584297f7737154bbf Mon Sep 17 00:00:00 2001 From: Paul Szabo Date: Fri, 22 Feb 2013 16:34:42 -0800 Subject: page-writeback.c: subtract min_free_kbytes from dirtyable memory When calculating amount of dirtyable memory, min_free_kbytes should be subtracted because it is not intended for dirty pages. Addresses http://bugs.debian.org/695182 [akpm@linux-foundation.org: fix up min_free_kbytes extern declarations] [akpm@linux-foundation.org: fix min() warning] Signed-off-by: Paul Szabo Acked-by: Rik van Riel Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel/sysctl.c') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 467d8b923fcd..95e9e55602a8 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -105,7 +105,6 @@ extern char core_pattern[]; extern unsigned int core_pipe_limit; #endif extern int pid_max; -extern int min_free_kbytes; extern int pid_max_min, pid_max_max; extern int sysctl_drop_caches; extern int percpu_pagelist_fraction; -- cgit v1.2.3 From e579d2c259be42b6f29458327e5153b22414b031 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 27 Feb 2013 17:03:15 -0800 Subject: coredump: remove redundant defines for dumpable states The existing SUID_DUMP_* defines duplicate the newer SUID_DUMPABLE_* defines introduced in 54b501992dd2 ("coredump: warn about unsafe suid_dumpable / core_pattern combo"). Remove the new ones, and use the prior values instead. Signed-off-by: Kees Cook Reported-by: Chen Gang Cc: Alexander Viro Cc: Alan Cox Cc: "Eric W. Biederman" Cc: Doug Ledford Cc: Serge Hallyn Cc: James Morris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sysctl.c') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index d8df00e69c14..d1b4ee67d2df 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2095,7 +2095,7 @@ int proc_dointvec_minmax(struct ctl_table *table, int write, static void validate_coredump_safety(void) { #ifdef CONFIG_COREDUMP - if (suid_dumpable == SUID_DUMPABLE_SAFE && + if (suid_dumpable == SUID_DUMP_ROOT && core_pattern[0] != '/' && core_pattern[0] != '|') { printk(KERN_WARNING "Unsafe core_pattern used with "\ "suid_dumpable=2. Pipe handler or fully qualified "\ -- cgit v1.2.3