From 775f4b297b780601e61787b766f306ed3e1d23eb Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 2 Jul 2012 07:52:16 -0400 Subject: random: make 'add_interrupt_randomness()' do something sane We've been moving away from add_interrupt_randomness() for various reasons: it's too expensive to do on every interrupt, and flooding the CPU with interrupts could theoretically cause bogus floods of entropy from a somewhat externally controllable source. This solves both problems by limiting the actual randomness addition to just once a second or after 64 interrupts, whicever comes first. During that time, the interrupt cycle data is buffered up in a per-cpu pool. Also, we make sure the the nonblocking pool used by urandom is initialized before we start feeding the normal input pool. This assures that /dev/urandom is returning unpredictable data as soon as possible. (Based on an original patch by Linus, but significantly modified by tytso.) Tested-by: Eric Wustrow Reported-by: Eric Wustrow Reported-by: Nadia Heninger Reported-by: Zakir Durumeric Reported-by: J. Alex Halderman . Signed-off-by: Linus Torvalds Signed-off-by: "Theodore Ts'o" Cc: stable@vger.kernel.org --- kernel/irq/handle.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index bdb180325551..131ca176b497 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -133,7 +133,7 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) { irqreturn_t retval = IRQ_NONE; - unsigned int random = 0, irq = desc->irq_data.irq; + unsigned int flags = 0, irq = desc->irq_data.irq; do { irqreturn_t res; @@ -161,7 +161,7 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) /* Fall through to add to randomness */ case IRQ_HANDLED: - random |= action->flags; + flags |= action->flags; break; default: @@ -172,8 +172,7 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) action = action->next; } while (action); - if (random & IRQF_SAMPLE_RANDOM) - add_interrupt_randomness(irq); + add_interrupt_randomness(irq, flags); if (!noirqdebug) note_interrupt(irq, desc, retval); -- cgit v1.2.3 From 70498253186586e5dca7bc3ebd3415203b059fbc Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Mon, 16 Jul 2012 18:35:29 -0700 Subject: kmsg - properly print over-long continuation lines Reserve PREFIX_MAX bytes in the LOG_LINE_MAX line when buffering a continuation line, to be able to properly prefix the LOG_LINE_MAX line with the syslog prefix and timestamp when printing it. Reported-By: Dave Jones Signed-off-by: Kay Sievers Cc: stable Signed-off-by: Greg Kroah-Hartman --- kernel/printk.c | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 177fa49357a5..d87ca5c6a989 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -235,7 +235,8 @@ static u32 log_next_idx; static u64 clear_seq; static u32 clear_idx; -#define LOG_LINE_MAX 1024 +#define PREFIX_MAX 32 +#define LOG_LINE_MAX 1024 - PREFIX_MAX /* record buffer */ #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) @@ -876,7 +877,7 @@ static size_t msg_print_text(const struct log *msg, enum log_flags prev, if (buf) { if (print_prefix(msg, syslog, NULL) + - text_len + 1>= size - len) + text_len + 1 >= size - len) break; if (prefix) @@ -907,7 +908,7 @@ static int syslog_print(char __user *buf, int size) struct log *msg; int len = 0; - text = kmalloc(LOG_LINE_MAX, GFP_KERNEL); + text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL); if (!text) return -ENOMEM; @@ -930,7 +931,8 @@ static int syslog_print(char __user *buf, int size) skip = syslog_partial; msg = log_from_idx(syslog_idx); - n = msg_print_text(msg, syslog_prev, true, text, LOG_LINE_MAX); + n = msg_print_text(msg, syslog_prev, true, text, + LOG_LINE_MAX + PREFIX_MAX); if (n - syslog_partial <= size) { /* message fits into buffer, move forward */ syslog_idx = log_next(syslog_idx); @@ -969,7 +971,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) char *text; int len = 0; - text = kmalloc(LOG_LINE_MAX, GFP_KERNEL); + text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL); if (!text) return -ENOMEM; @@ -1022,7 +1024,8 @@ static int syslog_print_all(char __user *buf, int size, bool clear) struct log *msg = log_from_idx(idx); int textlen; - textlen = msg_print_text(msg, prev, true, text, LOG_LINE_MAX); + textlen = msg_print_text(msg, prev, true, text, + LOG_LINE_MAX + PREFIX_MAX); if (textlen < 0) { len = textlen; break; @@ -1367,15 +1370,15 @@ static struct cont { bool flushed:1; /* buffer sealed and committed */ } cont; -static void cont_flush(void) +static void cont_flush(enum log_flags flags) { if (cont.flushed) return; if (cont.len == 0) return; - log_store(cont.facility, cont.level, LOG_NOCONS, cont.ts_nsec, - NULL, 0, cont.buf, cont.len); + log_store(cont.facility, cont.level, LOG_NOCONS | flags, + cont.ts_nsec, NULL, 0, cont.buf, cont.len); cont.flushed = true; } @@ -1386,7 +1389,8 @@ static bool cont_add(int facility, int level, const char *text, size_t len) return false; if (cont.len + len > sizeof(cont.buf)) { - cont_flush(); + /* the line gets too long, split it up in separate records */ + cont_flush(LOG_CONT); return false; } @@ -1522,7 +1526,7 @@ asmlinkage int vprintk_emit(int facility, int level, * or another task also prints continuation lines. */ if (cont.len && (lflags & LOG_PREFIX || cont.owner != current)) - cont_flush(); + cont_flush(0); /* buffer line if possible, otherwise store it right away */ if (!cont_add(facility, level, text, text_len)) @@ -1540,7 +1544,7 @@ asmlinkage int vprintk_emit(int facility, int level, if (cont.len && cont.owner == current) { if (!(lflags & LOG_PREFIX)) stored = cont_add(facility, level, text, text_len); - cont_flush(); + cont_flush(0); } if (!stored) @@ -1633,7 +1637,8 @@ EXPORT_SYMBOL(printk); #else -#define LOG_LINE_MAX 0 +#define LOG_LINE_MAX 0 +#define PREFIX_MAX 0 static struct cont { size_t len; size_t cons; @@ -1938,7 +1943,7 @@ static enum log_flags console_prev; */ void console_unlock(void) { - static char text[LOG_LINE_MAX]; + static char text[LOG_LINE_MAX + PREFIX_MAX]; static u64 seen_seq; unsigned long flags; bool wake_klogd = false; -- cgit v1.2.3 From 96efedf1491cdf0616e5e4fff0711cebf20f69c7 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Mon, 16 Jul 2012 18:35:29 -0700 Subject: kmsg - avoid warning for CONFIG_PRINTK=n compilations Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- kernel/printk.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index d87ca5c6a989..6c3d5bf14da2 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -216,6 +216,7 @@ struct log { */ static DEFINE_RAW_SPINLOCK(logbuf_lock); +#ifdef CONFIG_PRINTK /* the next printk record to read by syslog(READ) or /proc/kmsg */ static u64 syslog_seq; static u32 syslog_idx; @@ -228,7 +229,6 @@ static u32 log_first_idx; /* index and sequence number of the next record to store in the buffer */ static u64 log_next_seq; -#ifdef CONFIG_PRINTK static u32 log_next_idx; /* the next printk record to read after the last 'clear' command */ @@ -1635,10 +1635,17 @@ asmlinkage int printk(const char *fmt, ...) } EXPORT_SYMBOL(printk); -#else +#else /* CONFIG_PRINTK */ #define LOG_LINE_MAX 0 #define PREFIX_MAX 0 +#define LOG_LINE_MAX 0 +static u64 syslog_seq; +static u32 syslog_idx; +static enum log_flags syslog_prev; +static u64 log_first_seq; +static u32 log_first_idx; +static u64 log_next_seq; static struct cont { size_t len; size_t cons; -- cgit v1.2.3 From d39f3d77c9b1fe7cc33a14beb4a4849af0a4ac22 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Mon, 16 Jul 2012 18:35:30 -0700 Subject: kmsg - export "continuation record" flag to /dev/kmsg In some cases we are forced to store individual records for a continuation line print. Export a flag to allow the external re-construction of the line. The flag allows us to apply a similar logic externally which is used internally when the console, /proc/kmsg or the syslog() output is printed. $ cat /dev/kmsg 4,165,0,-;Free swap = 0kB 4,166,0,-;Total swap = 0kB 6,167,0,c;[ 4,168,0,+;0 4,169,0,+;1 4,170,0,+;2 4,171,0,+;3 4,172,0,+;] 6,173,0,-;[0 1 2 3 ] 6,174,0,-;Console: colour VGA+ 80x25 6,175,0,-;console [tty0] enabled Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- kernel/printk.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 6c3d5bf14da2..a41106e19077 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -361,6 +361,7 @@ static void log_store(int facility, int level, struct devkmsg_user { u64 seq; u32 idx; + enum log_flags prev; struct mutex lock; char buf[8192]; }; @@ -426,6 +427,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, struct log *msg; u64 ts_usec; size_t i; + char cont = '-'; size_t len; ssize_t ret; @@ -463,8 +465,25 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, msg = log_from_idx(user->idx); ts_usec = msg->ts_nsec; do_div(ts_usec, 1000); - len = sprintf(user->buf, "%u,%llu,%llu;", - (msg->facility << 3) | msg->level, user->seq, ts_usec); + + /* + * If we couldn't merge continuation line fragments during the print, + * export the stored flags to allow an optional external merge of the + * records. Merging the records isn't always neccessarily correct, like + * when we hit a race during printing. In most cases though, it produces + * better readable output. 'c' in the record flags mark the first + * fragment of a line, '+' the following. + */ + if (msg->flags & LOG_CONT && !(user->prev & LOG_CONT)) + cont = 'c'; + else if ((msg->flags & LOG_CONT) || + ((user->prev & LOG_CONT) && !(msg->flags & LOG_PREFIX))) + cont = '+'; + + len = sprintf(user->buf, "%u,%llu,%llu,%c;", + (msg->facility << 3) | msg->level, + user->seq, ts_usec, cont); + user->prev = msg->flags; /* escape non-printable characters */ for (i = 0; i < msg->text_len; i++) { -- cgit v1.2.3 From eab072609e11a357181806ab5a5c309ef6eb76f5 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Mon, 16 Jul 2012 18:35:30 -0700 Subject: kmsg - do not flush partial lines when the console is busy Fragments of continuation lines are flushed to the console immediately. In case the console is locked, the fragment must be queued up in the cont buffer. If the the console is busy and the continuation line is complete, but no part of it was written to the console up to this point, we can just store the entire line as a regular record and free the buffer earlier. If the console is busy and earlier messages are already queued up, we should not flush the fragments of continuation lines, but store them after the queued up messages, to ensure the proper ordering. This keeps the console output better readable in case printk()s race against each other, or we receive over-long continuation lines we need to flush. Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- kernel/printk.c | 93 +++++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 68 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index a41106e19077..4da2377131b0 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -231,6 +231,11 @@ static u32 log_first_idx; static u64 log_next_seq; static u32 log_next_idx; +/* the next printk record to write to the console */ +static u64 console_seq; +static u32 console_idx; +static enum log_flags console_prev; + /* the next printk record to read after the last 'clear' command */ static u64 clear_seq; static u32 clear_idx; @@ -1386,6 +1391,7 @@ static struct cont { u64 ts_nsec; /* time of first print */ u8 level; /* log level of first message */ u8 facility; /* log level of first message */ + enum log_flags flags; /* prefix, newline flags */ bool flushed:1; /* buffer sealed and committed */ } cont; @@ -1396,10 +1402,25 @@ static void cont_flush(enum log_flags flags) if (cont.len == 0) return; - log_store(cont.facility, cont.level, LOG_NOCONS | flags, - cont.ts_nsec, NULL, 0, cont.buf, cont.len); - - cont.flushed = true; + if (cont.cons) { + /* + * If a fragment of this line was directly flushed to the + * console; wait for the console to pick up the rest of the + * line. LOG_NOCONS suppresses a duplicated output. + */ + log_store(cont.facility, cont.level, flags | LOG_NOCONS, + cont.ts_nsec, NULL, 0, cont.buf, cont.len); + cont.flags = flags; + cont.flushed = true; + } else { + /* + * If no fragment of this line ever reached the console, + * just submit it to the store and free the buffer. + */ + log_store(cont.facility, cont.level, flags, 0, + NULL, 0, cont.buf, cont.len); + cont.len = 0; + } } static bool cont_add(int facility, int level, const char *text, size_t len) @@ -1418,12 +1439,17 @@ static bool cont_add(int facility, int level, const char *text, size_t len) cont.level = level; cont.owner = current; cont.ts_nsec = local_clock(); + cont.flags = 0; cont.cons = 0; cont.flushed = false; } memcpy(cont.buf + cont.len, text, len); cont.len += len; + + if (cont.len > (sizeof(cont.buf) * 80) / 100) + cont_flush(LOG_CONT); + return true; } @@ -1432,7 +1458,7 @@ static size_t cont_print_text(char *text, size_t size) size_t textlen = 0; size_t len; - if (cont.cons == 0) { + if (cont.cons == 0 && (console_prev & LOG_NEWLINE)) { textlen += print_time(cont.ts_nsec, text); size -= textlen; } @@ -1447,7 +1473,8 @@ static size_t cont_print_text(char *text, size_t size) } if (cont.flushed) { - text[textlen++] = '\n'; + if (cont.flags & LOG_NEWLINE) + text[textlen++] = '\n'; /* got everything, release buffer */ cont.len = 0; } @@ -1545,7 +1572,7 @@ asmlinkage int vprintk_emit(int facility, int level, * or another task also prints continuation lines. */ if (cont.len && (lflags & LOG_PREFIX || cont.owner != current)) - cont_flush(0); + cont_flush(LOG_NEWLINE); /* buffer line if possible, otherwise store it right away */ if (!cont_add(facility, level, text, text_len)) @@ -1563,7 +1590,7 @@ asmlinkage int vprintk_emit(int facility, int level, if (cont.len && cont.owner == current) { if (!(lflags & LOG_PREFIX)) stored = cont_add(facility, level, text, text_len); - cont_flush(0); + cont_flush(LOG_NEWLINE); } if (!stored) @@ -1661,10 +1688,13 @@ EXPORT_SYMBOL(printk); #define LOG_LINE_MAX 0 static u64 syslog_seq; static u32 syslog_idx; +static u64 console_seq; +static u32 console_idx; static enum log_flags syslog_prev; static u64 log_first_seq; static u32 log_first_idx; static u64 log_next_seq; +static enum log_flags console_prev; static struct cont { size_t len; size_t cons; @@ -1948,10 +1978,34 @@ void wake_up_klogd(void) this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); } -/* the next printk record to write to the console */ -static u64 console_seq; -static u32 console_idx; -static enum log_flags console_prev; +static void console_cont_flush(char *text, size_t size) +{ + unsigned long flags; + size_t len; + + raw_spin_lock_irqsave(&logbuf_lock, flags); + + if (!cont.len) + goto out; + + /* + * We still queue earlier records, likely because the console was + * busy. The earlier ones need to be printed before this one, we + * did not flush any fragment so far, so just let it queue up. + */ + if (console_seq < log_next_seq && !cont.cons) + goto out; + + len = cont_print_text(text, size); + raw_spin_unlock(&logbuf_lock); + stop_critical_timings(); + call_console_drivers(cont.level, text, len); + start_critical_timings(); + local_irq_restore(flags); + return; +out: + raw_spin_unlock_irqrestore(&logbuf_lock, flags); +} /** * console_unlock - unlock the console system @@ -1983,19 +2037,7 @@ void console_unlock(void) console_may_schedule = 0; /* flush buffered message fragment immediately to console */ - raw_spin_lock_irqsave(&logbuf_lock, flags); - if (cont.len && (cont.cons < cont.len || cont.flushed)) { - size_t len; - - len = cont_print_text(text, sizeof(text)); - raw_spin_unlock(&logbuf_lock); - stop_critical_timings(); - call_console_drivers(cont.level, text, len); - start_critical_timings(); - local_irq_restore(flags); - } else - raw_spin_unlock_irqrestore(&logbuf_lock, flags); - + console_cont_flush(text, sizeof(text)); again: for (;;) { struct log *msg; @@ -2032,6 +2074,7 @@ skip: * will properly dump everything later. */ msg->flags &= ~LOG_NOCONS; + console_prev = msg->flags; goto skip; } -- cgit v1.2.3 From b2ad368bebc0f772613668e893fa176396e9094c Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Mon, 9 Jul 2012 17:10:39 -0700 Subject: tracing: Fix initialization failure path in tracing_set_tracer() If tracer->init() fails, current code will leave current_tracer pointing to an unusable tracer, which at best makes 'current_tracer' report inaccurate value. Fix the issue by pointing current_tracer to nop tracer, and only update current_tracer with the new one after all the initialization succeeds. Signed-off-by: Anton Vorontsov Acked-by: Steven Rostedt Signed-off-by: Greg Kroah-Hartman --- kernel/trace/trace.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 49249c28690d..44ee11e31b82 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3172,10 +3172,10 @@ static int tracing_set_tracer(const char *buf) } destroy_trace_option_files(topts); - current_trace = t; + current_trace = &nop_trace; - topts = create_trace_option_files(current_trace); - if (current_trace->use_max_tr) { + topts = create_trace_option_files(t); + if (t->use_max_tr) { int cpu; /* we need to make per cpu buffer sizes equivalent */ for_each_tracing_cpu(cpu) { @@ -3195,6 +3195,7 @@ static int tracing_set_tracer(const char *buf) goto out; } + current_trace = t; trace_branch_enable(tr); out: mutex_unlock(&trace_types_lock); -- cgit v1.2.3 From 21f679404a0c28bd5b1b3aff2a7218bbff4cb43d Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Mon, 9 Jul 2012 17:10:42 -0700 Subject: tracing/function: Introduce persistent trace option This patch introduces 'func_ptrace' option, now available in /sys/kernel/debug/tracing/options when function tracer is selected. The patch also adds some tiny code that calls back to pstore to record the trace. The callback is no-op when PSTORE=n. Signed-off-by: Anton Vorontsov Acked-by: Steven Rostedt Signed-off-by: Greg Kroah-Hartman --- kernel/trace/trace_functions.c | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index c7b0c6a7db09..13770abd7a12 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include "trace.h" @@ -74,6 +75,14 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip) preempt_enable_notrace(); } +/* Our two options */ +enum { + TRACE_FUNC_OPT_STACK = 0x1, + TRACE_FUNC_OPT_PSTORE = 0x2, +}; + +static struct tracer_flags func_flags; + static void function_trace_call(unsigned long ip, unsigned long parent_ip) { @@ -97,6 +106,12 @@ function_trace_call(unsigned long ip, unsigned long parent_ip) disabled = atomic_inc_return(&data->disabled); if (likely(disabled == 1)) { + /* + * So far tracing doesn't support multiple buffers, so + * we make an explicit call for now. + */ + if (unlikely(func_flags.val & TRACE_FUNC_OPT_PSTORE)) + pstore_ftrace_call(ip, parent_ip); pc = preempt_count(); trace_function(tr, ip, parent_ip, flags, pc); } @@ -158,14 +173,12 @@ static struct ftrace_ops trace_stack_ops __read_mostly = .flags = FTRACE_OPS_FL_GLOBAL, }; -/* Our two options */ -enum { - TRACE_FUNC_OPT_STACK = 0x1, -}; - static struct tracer_opt func_opts[] = { #ifdef CONFIG_STACKTRACE { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) }, +#endif +#ifdef CONFIG_PSTORE_FTRACE + { TRACER_OPT(func_pstore, TRACE_FUNC_OPT_PSTORE) }, #endif { } /* Always set a last empty entry */ }; @@ -217,6 +230,8 @@ static int func_set_flag(u32 old_flags, u32 bit, int set) register_ftrace_function(&trace_ops); } + return 0; + } else if (bit == TRACE_FUNC_OPT_PSTORE) { return 0; } -- cgit v1.2.3 From f555f1231a69846d57099760f9c361982600ffa2 Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Mon, 9 Jul 2012 17:10:46 -0700 Subject: tracing/function: Convert func_set_flag() to a switch statement Since the function accepts just one bit, we can use the switch construction instead of if/else if/... Just a cosmetic change, there should be no functional changes. Suggested-by: Steven Rostedt Signed-off-by: Anton Vorontsov Acked-by: Steven Rostedt Signed-off-by: Greg Kroah-Hartman --- kernel/trace/trace_functions.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 13770abd7a12..a426f410c060 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -217,10 +217,11 @@ static void tracing_stop_function_trace(void) static int func_set_flag(u32 old_flags, u32 bit, int set) { - if (bit == TRACE_FUNC_OPT_STACK) { + switch (bit) { + case TRACE_FUNC_OPT_STACK: /* do nothing if already set */ if (!!set == !!(func_flags.val & TRACE_FUNC_OPT_STACK)) - return 0; + break; if (set) { unregister_ftrace_function(&trace_ops); @@ -230,12 +231,14 @@ static int func_set_flag(u32 old_flags, u32 bit, int set) register_ftrace_function(&trace_ops); } - return 0; - } else if (bit == TRACE_FUNC_OPT_PSTORE) { - return 0; + break; + case TRACE_FUNC_OPT_PSTORE: + break; + default: + return -EINVAL; } - return -EINVAL; + return 0; } static struct tracer function_trace __read_mostly = -- cgit v1.2.3 From c5857ccf293968348e5eb4ebedc68074de3dcda6 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 14 Jul 2012 20:27:52 -0400 Subject: random: remove rand_initialize_irq() With the new interrupt sampling system, we are no longer using the timer_rand_state structure in the irq descriptor, so we can stop initializing it now. [ Merged in fixes from Sedat to find some last missing references to rand_initialize_irq() ] Signed-off-by: "Theodore Ts'o" Signed-off-by: Sedat Dilek --- kernel/irq/manage.c | 17 ----------------- 1 file changed, 17 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 8c548232ba39..5e42eb119677 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -893,22 +893,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) return -ENOSYS; if (!try_module_get(desc->owner)) return -ENODEV; - /* - * Some drivers like serial.c use request_irq() heavily, - * so we have to be careful not to interfere with a - * running system. - */ - if (new->flags & IRQF_SAMPLE_RANDOM) { - /* - * This function might sleep, we want to call it first, - * outside of the atomic block. - * Yes, this might clear the entropy pool if the wrong - * driver is attempted to be loaded, without actually - * installing a new handler, but is this really a problem, - * only the sysadmin is able to do this. - */ - rand_initialize_irq(irq); - } /* * Check whether the interrupt nests into another interrupt @@ -1354,7 +1338,6 @@ EXPORT_SYMBOL(free_irq); * Flags: * * IRQF_SHARED Interrupt is shared - * IRQF_SAMPLE_RANDOM The interrupt can be used for entropy * IRQF_TRIGGER_* Specify active edge(s) or level * */ -- cgit v1.2.3 From 6791457a090d9a234a40b501c2536f0aefaeae4b Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 18 Jul 2012 13:18:12 -0400 Subject: printk: Export struct log size and member offsets through vmcoreinfo There are tools like makedumpfile and vmcore-dmesg which can extract kernel log buffer from vmcore. Since we introduced structured logging, that functionality is broken. Now user space tools need to know about "struct log" and offsets of various fields to be able to parse struct log data and extract text message or dictonary. This patch exports some of the fields. Currently I am not exporting log "level" info as that is a bitfield and offsetof() bitfields can't be calculated. But if people start asking for log level info in the output then we probably either need to seprate out "level" or use bit shift operations for flags and level. Signed-off-by: Vivek Goyal Acked-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- kernel/printk.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 4da2377131b0..449364f07a1e 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -671,6 +671,15 @@ void log_buf_kexec_setup(void) VMCOREINFO_SYMBOL(log_buf_len); VMCOREINFO_SYMBOL(log_first_idx); VMCOREINFO_SYMBOL(log_next_idx); + /* + * Export struct log size and field offsets. User space tools can + * parse it and detect any changes to structure down the line. + */ + VMCOREINFO_STRUCT_SIZE(log); + VMCOREINFO_OFFSET(log, ts_nsec); + VMCOREINFO_OFFSET(log, len); + VMCOREINFO_OFFSET(log, text_len); + VMCOREINFO_OFFSET(log, dict_len); } #endif -- cgit v1.2.3 From d35be8bab9b0ce44bed4b9453f86ebf64062721e Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Thu, 24 May 2012 19:46:26 +0530 Subject: CPU hotplug, cpusets, suspend: Don't modify cpusets during suspend/resume In the event of CPU hotplug, the kernel modifies the cpusets' cpus_allowed masks as and when necessary to ensure that the tasks belonging to the cpusets have some place (online CPUs) to run on. And regular CPU hotplug is destructive in the sense that the kernel doesn't remember the original cpuset configurations set by the user, across hotplug operations. However, suspend/resume (which uses CPU hotplug) is a special case in which the kernel has the responsibility to restore the system (during resume), to exactly the same state it was in before suspend. In order to achieve that, do the following: 1. Don't modify cpusets during suspend/resume. At all. In particular, don't move the tasks from one cpuset to another, and don't modify any cpuset's cpus_allowed mask. So, simply ignore cpusets during the CPU hotplug operations that are carried out in the suspend/resume path. 2. However, cpusets and sched domains are related. We just want to avoid altering cpusets alone. So, to keep the sched domains updated, build a single sched domain (containing all active cpus) during each of the CPU hotplug operations carried out in s/r path, effectively ignoring the cpusets' cpus_allowed masks. (Since userspace is frozen while doing all this, it will go unnoticed.) 3. During the last CPU online operation during resume, build the sched domains by looking up the (unaltered) cpusets' cpus_allowed masks. That will bring back the system to the same original state as it was in before suspend. Ultimately, this will not only solve the cpuset problem related to suspend resume (ie., restores the cpusets to exactly what it was before suspend, by not touching it at all) but also speeds up suspend/resume because we avoid running cpuset update code for every CPU being offlined/onlined. Signed-off-by: Srivatsa S. Bhat Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20120524141611.3692.20155.stgit@srivatsabhat.in.ibm.com Signed-off-by: Ingo Molnar --- kernel/cpuset.c | 3 +++ kernel/sched/core.c | 40 ++++++++++++++++++++++++++++++++++++---- 2 files changed, 39 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 8c8bd652dd12..746d1eeb5dbe 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2054,6 +2054,9 @@ static void scan_for_empty_cpusets(struct cpuset *root) * (of no affect) on systems that are actively using CPU hotplug * but making no active use of cpusets. * + * The only exception to this is suspend/resume, where we don't + * modify cpusets at all. + * * This routine ensures that top_cpuset.cpus_allowed tracks * cpu_active_mask on each CPU hotplug (cpuhp) event. * diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 468bdd44c1ba..4c1d80c6b318 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7097,34 +7097,66 @@ match2: mutex_unlock(&sched_domains_mutex); } +static int num_cpus_frozen; /* used to mark begin/end of suspend/resume */ + /* * Update cpusets according to cpu_active mask. If cpusets are * disabled, cpuset_update_active_cpus() becomes a simple wrapper * around partition_sched_domains(). + * + * If we come here as part of a suspend/resume, don't touch cpusets because we + * want to restore it back to its original state upon resume anyway. */ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, void *hcpu) { - switch (action & ~CPU_TASKS_FROZEN) { + switch (action) { + case CPU_ONLINE_FROZEN: + case CPU_DOWN_FAILED_FROZEN: + + /* + * num_cpus_frozen tracks how many CPUs are involved in suspend + * resume sequence. As long as this is not the last online + * operation in the resume sequence, just build a single sched + * domain, ignoring cpusets. + */ + num_cpus_frozen--; + if (likely(num_cpus_frozen)) { + partition_sched_domains(1, NULL, NULL); + break; + } + + /* + * This is the last CPU online operation. So fall through and + * restore the original sched domains by considering the + * cpuset configurations. + */ + case CPU_ONLINE: case CPU_DOWN_FAILED: cpuset_update_active_cpus(); - return NOTIFY_OK; + break; default: return NOTIFY_DONE; } + return NOTIFY_OK; } static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, void *hcpu) { - switch (action & ~CPU_TASKS_FROZEN) { + switch (action) { case CPU_DOWN_PREPARE: cpuset_update_active_cpus(); - return NOTIFY_OK; + break; + case CPU_DOWN_PREPARE_FROZEN: + num_cpus_frozen++; + partition_sched_domains(1, NULL, NULL); + break; default: return NOTIFY_DONE; } + return NOTIFY_OK; } void __init sched_init_smp(void) -- cgit v1.2.3 From 80d1fa6463d934969b7aebf04107fc133463f0f6 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Thu, 24 May 2012 19:46:41 +0530 Subject: cpusets, hotplug: Implement cpuset tree traversal in a helper function At present, the functions that deal with cpusets during CPU/Mem hotplug are quite messy, since a lot of the functionality is mixed up without clear separation. And this takes a toll on optimization as well. For example, the function cpuset_update_active_cpus() is called on both CPU offline and CPU online events; and it invokes scan_for_empty_cpusets(), which makes sense only for CPU offline events. And hence, the current code ends up unnecessarily traversing the cpuset tree during CPU online also. As a first step towards cleaning up those functions, encapsulate the cpuset tree traversal in a helper function, so as to facilitate upcoming changes. Signed-off-by: Srivatsa S. Bhat Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20120524141635.3692.893.stgit@srivatsabhat.in.ibm.com Signed-off-by: Ingo Molnar --- kernel/cpuset.c | 36 +++++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 746d1eeb5dbe..ba96349aa522 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1989,6 +1989,32 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) move_member_tasks_to_cpuset(cs, parent); } +/* + * Helper function to traverse cpusets. + * It can be used to walk the cpuset tree from top to bottom, completing + * one layer before dropping down to the next (thus always processing a + * node before any of its children). + */ +static struct cpuset *cpuset_next(struct list_head *queue) +{ + struct cpuset *cp; + struct cpuset *child; /* scans child cpusets of cp */ + struct cgroup *cont; + + if (list_empty(queue)) + return NULL; + + cp = list_first_entry(queue, struct cpuset, stack_list); + list_del(queue->next); + list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { + child = cgroup_cs(cont); + list_add_tail(&child->stack_list, queue); + } + + return cp; +} + + /* * Walk the specified cpuset subtree and look for empty cpusets. * The tasks of such cpuset must be moved to a parent cpuset. @@ -2008,19 +2034,11 @@ static void scan_for_empty_cpusets(struct cpuset *root) { LIST_HEAD(queue); struct cpuset *cp; /* scans cpusets being updated */ - struct cpuset *child; /* scans child cpusets of cp */ - struct cgroup *cont; static nodemask_t oldmems; /* protected by cgroup_mutex */ list_add_tail((struct list_head *)&root->stack_list, &queue); - while (!list_empty(&queue)) { - cp = list_first_entry(&queue, struct cpuset, stack_list); - list_del(queue.next); - list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { - child = cgroup_cs(cont); - list_add_tail(&child->stack_list, &queue); - } + while ((cp = cpuset_next(&queue)) != NULL) { /* Continue past cpusets with all cpus, mems online */ if (cpumask_subset(cp->cpus_allowed, cpu_active_mask) && -- cgit v1.2.3 From 7ddf96b02fe8dd441f452deef879040def5f7b34 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Thu, 24 May 2012 19:46:55 +0530 Subject: cpusets, hotplug: Restructure functions that are invoked during hotplug Separate out the cpuset related handling for CPU/Memory online/offline. This also helps us exploit the most obvious and basic level of optimization that any notification mechanism (CPU/Mem online/offline) has to offer us: "We *know* why we have been invoked. So stop pretending that we are lost, and do only the necessary amount of processing!". And while at it, rename scan_for_empty_cpusets() to scan_cpusets_upon_hotplug(), which is more appropriate considering how it is restructured. Signed-off-by: Srivatsa S. Bhat Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20120524141650.3692.48637.stgit@srivatsabhat.in.ibm.com Signed-off-by: Ingo Molnar --- kernel/cpuset.c | 88 +++++++++++++++++++++++++++++++++++++---------------- kernel/sched/core.c | 4 +-- 2 files changed, 63 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index ba96349aa522..ba0a4d74d262 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -147,6 +147,12 @@ typedef enum { CS_SPREAD_SLAB, } cpuset_flagbits_t; +/* the type of hotplug event */ +enum hotplug_event { + CPUSET_CPU_OFFLINE, + CPUSET_MEM_OFFLINE, +}; + /* convenient tests for these bits */ static inline int is_cpu_exclusive(const struct cpuset *cs) { @@ -2016,8 +2022,10 @@ static struct cpuset *cpuset_next(struct list_head *queue) /* - * Walk the specified cpuset subtree and look for empty cpusets. - * The tasks of such cpuset must be moved to a parent cpuset. + * Walk the specified cpuset subtree upon a hotplug operation (CPU/Memory + * online/offline) and update the cpusets accordingly. + * For regular CPU/Mem hotplug, look for empty cpusets; the tasks of such + * cpuset must be moved to a parent cpuset. * * Called with cgroup_mutex held. We take callback_mutex to modify * cpus_allowed and mems_allowed. @@ -2030,38 +2038,58 @@ static struct cpuset *cpuset_next(struct list_head *queue) * that has tasks along with an empty 'mems'. But if we did see such * a cpuset, we'd handle it just like we do if its 'cpus' was empty. */ -static void scan_for_empty_cpusets(struct cpuset *root) +static void +scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event) { LIST_HEAD(queue); - struct cpuset *cp; /* scans cpusets being updated */ + struct cpuset *cp; /* scans cpusets being updated */ static nodemask_t oldmems; /* protected by cgroup_mutex */ list_add_tail((struct list_head *)&root->stack_list, &queue); - while ((cp = cpuset_next(&queue)) != NULL) { + switch (event) { + case CPUSET_CPU_OFFLINE: + while ((cp = cpuset_next(&queue)) != NULL) { + + /* Continue past cpusets with all cpus online */ + if (cpumask_subset(cp->cpus_allowed, cpu_active_mask)) + continue; + + /* Remove offline cpus from this cpuset. */ + mutex_lock(&callback_mutex); + cpumask_and(cp->cpus_allowed, cp->cpus_allowed, + cpu_active_mask); + mutex_unlock(&callback_mutex); + + /* Move tasks from the empty cpuset to a parent */ + if (cpumask_empty(cp->cpus_allowed)) + remove_tasks_in_empty_cpuset(cp); + else + update_tasks_cpumask(cp, NULL); + } + break; - /* Continue past cpusets with all cpus, mems online */ - if (cpumask_subset(cp->cpus_allowed, cpu_active_mask) && - nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) - continue; + case CPUSET_MEM_OFFLINE: + while ((cp = cpuset_next(&queue)) != NULL) { - oldmems = cp->mems_allowed; + /* Continue past cpusets with all mems online */ + if (nodes_subset(cp->mems_allowed, + node_states[N_HIGH_MEMORY])) + continue; - /* Remove offline cpus and mems from this cpuset. */ - mutex_lock(&callback_mutex); - cpumask_and(cp->cpus_allowed, cp->cpus_allowed, - cpu_active_mask); - nodes_and(cp->mems_allowed, cp->mems_allowed, + oldmems = cp->mems_allowed; + + /* Remove offline mems from this cpuset. */ + mutex_lock(&callback_mutex); + nodes_and(cp->mems_allowed, cp->mems_allowed, node_states[N_HIGH_MEMORY]); - mutex_unlock(&callback_mutex); + mutex_unlock(&callback_mutex); - /* Move tasks from the empty cpuset to a parent */ - if (cpumask_empty(cp->cpus_allowed) || - nodes_empty(cp->mems_allowed)) - remove_tasks_in_empty_cpuset(cp); - else { - update_tasks_cpumask(cp, NULL); - update_tasks_nodemask(cp, &oldmems, NULL); + /* Move tasks from the empty cpuset to a parent */ + if (nodes_empty(cp->mems_allowed)) + remove_tasks_in_empty_cpuset(cp); + else + update_tasks_nodemask(cp, &oldmems, NULL); } } } @@ -2080,8 +2108,11 @@ static void scan_for_empty_cpusets(struct cpuset *root) * * Called within get_online_cpus(). Needs to call cgroup_lock() * before calling generate_sched_domains(). + * + * @cpu_online: Indicates whether this is a CPU online event (true) or + * a CPU offline event (false). */ -void cpuset_update_active_cpus(void) +void cpuset_update_active_cpus(bool cpu_online) { struct sched_domain_attr *attr; cpumask_var_t *doms; @@ -2091,7 +2122,10 @@ void cpuset_update_active_cpus(void) mutex_lock(&callback_mutex); cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); mutex_unlock(&callback_mutex); - scan_for_empty_cpusets(&top_cpuset); + + if (!cpu_online) + scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_CPU_OFFLINE); + ndoms = generate_sched_domains(&doms, &attr); cgroup_unlock(); @@ -2122,9 +2156,9 @@ static int cpuset_track_online_nodes(struct notifier_block *self, case MEM_OFFLINE: /* * needn't update top_cpuset.mems_allowed explicitly because - * scan_for_empty_cpusets() will update it. + * scan_cpusets_upon_hotplug() will update it. */ - scan_for_empty_cpusets(&top_cpuset); + scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_MEM_OFFLINE); break; default: break; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4c1d80c6b318..4b4a63d34396 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7134,7 +7134,7 @@ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, case CPU_ONLINE: case CPU_DOWN_FAILED: - cpuset_update_active_cpus(); + cpuset_update_active_cpus(true); break; default: return NOTIFY_DONE; @@ -7147,7 +7147,7 @@ static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, { switch (action) { case CPU_DOWN_PREPARE: - cpuset_update_active_cpus(); + cpuset_update_active_cpus(false); break; case CPU_DOWN_PREPARE_FROZEN: num_cpus_frozen++; -- cgit v1.2.3 From a1cd2b13f754b2c56fb87b8c4912c015f8f57c0c Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Thu, 24 May 2012 19:47:03 +0530 Subject: cpusets: Remove/update outdated comments cpuset_track_online_cpus() is no longer present. So remove the outdated comment and replace it with reference to cpuset_update_active_cpus() which is its equivalent. Also, we don't lack memory hot-unplug anymore. And David Rientjes pointed out how it is dealt with. So update that comment as well. Signed-off-by: Srivatsa S. Bhat Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20120524141700.3692.98192.stgit@srivatsabhat.in.ibm.com Signed-off-by: Ingo Molnar --- kernel/cpuset.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index ba0a4d74d262..f33c7153b6d7 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2034,9 +2034,8 @@ static struct cpuset *cpuset_next(struct list_head *queue) * before dropping down to the next. It always processes a node before * any of its children. * - * For now, since we lack memory hot unplug, we'll never see a cpuset - * that has tasks along with an empty 'mems'. But if we did see such - * a cpuset, we'd handle it just like we do if its 'cpus' was empty. + * In the case of memory hot-unplug, it will remove nodes from N_HIGH_MEMORY + * if all present pages from a node are offlined. */ static void scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event) @@ -2137,7 +2136,7 @@ void cpuset_update_active_cpus(bool cpu_online) /* * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY]. * Call this routine anytime after node_states[N_HIGH_MEMORY] changes. - * See also the previous routine cpuset_track_online_cpus(). + * See cpuset_update_active_cpus() for CPU hotplug handling. */ static int cpuset_track_online_nodes(struct notifier_block *self, unsigned long action, void *arg) -- cgit v1.2.3 From 970e178985cadbca660feb02f4d2ee3a09f7fdda Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Tue, 12 Jun 2012 05:18:32 +0200 Subject: sched: Improve scalability via 'CPU buddies', which withstand random perturbations Traversing an entire package is not only expensive, it also leads to tasks bouncing all over a partially idle and possible quite large package. Fix that up by assigning a 'buddy' CPU to try to motivate. Each buddy may try to motivate that one other CPU, if it's busy, tough, it may then try its SMT sibling, but that's all this optimization is allowed to cost. Sibling cache buddies are cross-wired to prevent bouncing. 4 socket 40 core + SMT Westmere box, single 30 sec tbench runs, higher is better: clients 1 2 4 8 16 32 64 128 .......................................................................... pre 30 41 118 645 3769 6214 12233 14312 post 299 603 1211 2418 4697 6847 11606 14557 A nice increase in performance. Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1339471112.7352.32.camel@marge.simpson.net Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 39 ++++++++++++++++++++++++++++++++++++++- kernel/sched/fair.c | 28 +++++++--------------------- 2 files changed, 45 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4b4a63d34396..536b213f0ce5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6024,6 +6024,11 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu) * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this * allows us to avoid some pointer chasing select_idle_sibling(). * + * Iterate domains and sched_groups downward, assigning CPUs to be + * select_idle_sibling() hw buddy. Cross-wiring hw makes bouncing + * due to random perturbation self canceling, ie sw buddies pull + * their counterpart to their CPU's hw counterpart. + * * Also keep a unique ID per domain (we use the first cpu number in * the cpumask of the domain), this allows us to quickly tell if * two cpus are in the same cache domain, see cpus_share_cache(). @@ -6037,8 +6042,40 @@ static void update_top_cache_domain(int cpu) int id = cpu; sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); - if (sd) + if (sd) { + struct sched_domain *tmp = sd; + struct sched_group *sg, *prev; + bool right; + + /* + * Traverse to first CPU in group, and count hops + * to cpu from there, switching direction on each + * hop, never ever pointing the last CPU rightward. + */ + do { + id = cpumask_first(sched_domain_span(tmp)); + prev = sg = tmp->groups; + right = 1; + + while (cpumask_first(sched_group_cpus(sg)) != id) + sg = sg->next; + + while (!cpumask_test_cpu(cpu, sched_group_cpus(sg))) { + prev = sg; + sg = sg->next; + right = !right; + } + + /* A CPU went down, never point back to domain start. */ + if (right && cpumask_first(sched_group_cpus(sg->next)) == id) + right = false; + + sg = right ? sg->next : prev; + tmp->idle_buddy = cpumask_first(sched_group_cpus(sg)); + } while ((tmp = tmp->child)); + id = cpumask_first(sched_domain_span(sd)); + } rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); per_cpu(sd_llc_id, cpu) = id; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c099cc6eebe3..dd00aaf44fda 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2637,8 +2637,6 @@ static int select_idle_sibling(struct task_struct *p, int target) int cpu = smp_processor_id(); int prev_cpu = task_cpu(p); struct sched_domain *sd; - struct sched_group *sg; - int i; /* * If the task is going to be woken-up on this cpu and if it is @@ -2655,29 +2653,17 @@ static int select_idle_sibling(struct task_struct *p, int target) return prev_cpu; /* - * Otherwise, iterate the domains and find an elegible idle cpu. + * Otherwise, check assigned siblings to find an elegible idle cpu. */ sd = rcu_dereference(per_cpu(sd_llc, target)); - for_each_lower_domain(sd) { - sg = sd->groups; - do { - if (!cpumask_intersects(sched_group_cpus(sg), - tsk_cpus_allowed(p))) - goto next; - - for_each_cpu(i, sched_group_cpus(sg)) { - if (!idle_cpu(i)) - goto next; - } - target = cpumask_first_and(sched_group_cpus(sg), - tsk_cpus_allowed(p)); - goto done; -next: - sg = sg->next; - } while (sg != sd->groups); + for_each_lower_domain(sd) { + if (!cpumask_test_cpu(sd->idle_buddy, tsk_cpus_allowed(p))) + continue; + if (idle_cpu(sd->idle_buddy)) + return sd->idle_buddy; } -done: + return target; } -- cgit v1.2.3 From 85c1e7dae165acd004429f81fe52bfbf55b57a98 Mon Sep 17 00:00:00 2001 From: Prashanth Nageshappa Date: Tue, 19 Jun 2012 17:47:34 +0530 Subject: sched: Reorder 'struct lb_env' members to reduce its size Members of 'struct lb_env' are not in appropriate order to reuse compiler added padding on 64bit architectures. In this patch we reorder those struct members and help reduce the size of the structure from 96 bytes to 80 bytes on 64 bit architectures. Suggested-by: Srivatsa Vaddagiri Signed-off-by: Prashanth Nageshappa Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/4FE06DDE.7000403@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index dd00aaf44fda..9361669d4242 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3058,8 +3058,8 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10; struct lb_env { struct sched_domain *sd; - int src_cpu; struct rq *src_rq; + int src_cpu; int dst_cpu; struct rq *dst_rq; -- cgit v1.2.3 From bbf18b19495942cc730e8ff11fc3ffadf20cbfe1 Mon Sep 17 00:00:00 2001 From: Prashanth Nageshappa Date: Tue, 19 Jun 2012 17:52:07 +0530 Subject: sched: Reset loop counters if all tasks are pinned and we need to redo load balance While load balancing, if all tasks on the source runqueue are pinned, we retry after excluding the corresponding source cpu. However, loop counters env.loop and env.loop_break are not reset before retrying, which can lead to failure in moving the tasks. In this patch we reset env.loop and env.loop_break to their inital values before we retry. Signed-off-by: Prashanth Nageshappa Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/4FE06EEF.2090709@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9361669d4242..f9f9aa0edf3c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4288,8 +4288,11 @@ more_balance: /* All tasks on this runqueue were pinned by CPU affinity */ if (unlikely(env.flags & LBF_ALL_PINNED)) { cpumask_clear_cpu(cpu_of(busiest), cpus); - if (!cpumask_empty(cpus)) + if (!cpumask_empty(cpus)) { + env.loop = 0; + env.loop_break = sched_nr_migrate_break; goto redo; + } goto out_balanced; } } -- cgit v1.2.3 From 88b8dac0a14c511ff41486b83a8c3d688936eec0 Mon Sep 17 00:00:00 2001 From: Srivatsa Vaddagiri Date: Tue, 19 Jun 2012 17:43:15 +0530 Subject: sched: Improve balance_cpu() to consider other cpus in its group as target of (pinned) task Current load balance scheme requires only one cpu in a sched_group (balance_cpu) to look at other peer sched_groups for imbalance and pull tasks towards itself from a busy cpu. Tasks thus pulled by balance_cpu could later get picked up by cpus that are in the same sched_group as that of balance_cpu. This scheme however fails to pull tasks that are not allowed to run on balance_cpu (but are allowed to run on other cpus in its sched_group). That can affect fairness and in some worst case scenarios cause starvation. Consider a two core (2 threads/core) system running tasks as below: Core0 Core1 / \ / \ C0 C1 C2 C3 | | | | v v v v F0 T1 F1 [idle] T2 F0 = SCHED_FIFO task (pinned to C0) F1 = SCHED_FIFO task (pinned to C2) T1 = SCHED_OTHER task (pinned to C1) T2 = SCHED_OTHER task (pinned to C1 and C2) F1 could become a cpu hog, which will starve T2 unless C1 pulls it. Between C0 and C1 however, C0 is required to look for imbalance between cores, which will fail to pull T2 towards Core0. T2 will starve eternally in this case. The same scenario can arise in presence of non-rt tasks as well (say we replace F1 with high irq load). We tackle this problem by having balance_cpu move pinned tasks to one of its sibling cpus (where they can run). We first check if load balance goal can be met by ignoring pinned tasks, failing which we retry move_tasks() with a new env->dst_cpu. This patch modifies load balance semantics on who can move load towards a given cpu in a given sched_domain. Before this patch, a given_cpu or a ilb_cpu acting on behalf of an idle given_cpu is responsible for moving load to given_cpu. With this patch applied, balance_cpu can in addition decide on moving some load to a given_cpu. There is a remote possibility that excess load could get moved as a result of this (balance_cpu and given_cpu/ilb_cpu deciding *independently* and at *same* time to move some load to a given_cpu). However we should see less of such conflicting decisions in practice and moreover subsequent load balance cycles should correct the excess load moved to given_cpu. Signed-off-by: Srivatsa Vaddagiri Signed-off-by: Prashanth Nageshappa Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/4FE06CDB.2060605@linux.vnet.ibm.com [ minor edits ] Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 74 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f9f9aa0edf3c..22321db64952 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3054,6 +3054,7 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10; #define LBF_ALL_PINNED 0x01 #define LBF_NEED_BREAK 0x02 +#define LBF_SOME_PINNED 0x04 struct lb_env { struct sched_domain *sd; @@ -3064,6 +3065,8 @@ struct lb_env { int dst_cpu; struct rq *dst_rq; + struct cpumask *dst_grpmask; + int new_dst_cpu; enum cpu_idle_type idle; long imbalance; unsigned int flags; @@ -3131,9 +3134,31 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) * 3) are cache-hot on their current CPU. */ if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { + int new_dst_cpu; + schedstat_inc(p, se.statistics.nr_failed_migrations_affine); + + /* + * Remember if this task can be migrated to any other cpu in + * our sched_group. We may want to revisit it if we couldn't + * meet load balance goals by pulling other tasks on src_cpu. + * + * Also avoid computing new_dst_cpu if we have already computed + * one in current iteration. + */ + if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED)) + return 0; + + new_dst_cpu = cpumask_first_and(env->dst_grpmask, + tsk_cpus_allowed(p)); + if (new_dst_cpu < nr_cpu_ids) { + env->flags |= LBF_SOME_PINNED; + env->new_dst_cpu = new_dst_cpu; + } return 0; } + + /* Record that we found atleast one task that could run on dst_cpu */ env->flags &= ~LBF_ALL_PINNED; if (task_running(env->src_rq, p)) { @@ -4213,7 +4238,8 @@ static int load_balance(int this_cpu, struct rq *this_rq, struct sched_domain *sd, enum cpu_idle_type idle, int *balance) { - int ld_moved, active_balance = 0; + int ld_moved, cur_ld_moved, active_balance = 0; + int lb_iterations, max_lb_iterations; struct sched_group *group; struct rq *busiest; unsigned long flags; @@ -4223,11 +4249,13 @@ static int load_balance(int this_cpu, struct rq *this_rq, .sd = sd, .dst_cpu = this_cpu, .dst_rq = this_rq, + .dst_grpmask = sched_group_cpus(sd->groups), .idle = idle, .loop_break = sched_nr_migrate_break, }; cpumask_copy(cpus, cpu_active_mask); + max_lb_iterations = cpumask_weight(env.dst_grpmask); schedstat_inc(sd, lb_count[idle]); @@ -4253,6 +4281,7 @@ redo: schedstat_add(sd, lb_imbalance[idle], env.imbalance); ld_moved = 0; + lb_iterations = 1; if (busiest->nr_running > 1) { /* * Attempt to move tasks. If find_busiest_group has found @@ -4270,7 +4299,13 @@ more_balance: double_rq_lock(this_rq, busiest); if (!env.loop) update_h_load(env.src_cpu); - ld_moved += move_tasks(&env); + + /* + * cur_ld_moved - load moved in current iteration + * ld_moved - cumulative load moved across iterations + */ + cur_ld_moved = move_tasks(&env); + ld_moved += cur_ld_moved; double_rq_unlock(this_rq, busiest); local_irq_restore(flags); @@ -4282,8 +4317,43 @@ more_balance: /* * some other cpu did the load balance for us. */ - if (ld_moved && this_cpu != smp_processor_id()) - resched_cpu(this_cpu); + if (cur_ld_moved && env.dst_cpu != smp_processor_id()) + resched_cpu(env.dst_cpu); + + /* + * Revisit (affine) tasks on src_cpu that couldn't be moved to + * us and move them to an alternate dst_cpu in our sched_group + * where they can run. The upper limit on how many times we + * iterate on same src_cpu is dependent on number of cpus in our + * sched_group. + * + * This changes load balance semantics a bit on who can move + * load to a given_cpu. In addition to the given_cpu itself + * (or a ilb_cpu acting on its behalf where given_cpu is + * nohz-idle), we now have balance_cpu in a position to move + * load to given_cpu. In rare situations, this may cause + * conflicts (balance_cpu and given_cpu/ilb_cpu deciding + * _independently_ and at _same_ time to move some load to + * given_cpu) causing exceess load to be moved to given_cpu. + * This however should not happen so much in practice and + * moreover subsequent load balance cycles should correct the + * excess load moved. + */ + if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 && + lb_iterations++ < max_lb_iterations) { + + this_rq = cpu_rq(env.new_dst_cpu); + env.dst_rq = this_rq; + env.dst_cpu = env.new_dst_cpu; + env.flags &= ~LBF_SOME_PINNED; + env.loop = 0; + env.loop_break = sched_nr_migrate_break; + /* + * Go back to "more_balance" rather than "redo" since we + * need to continue with same src_cpu. + */ + goto more_balance; + } /* All tasks on this runqueue were pinned by CPU affinity */ if (unlikely(env.flags & LBF_ALL_PINNED)) { -- cgit v1.2.3 From 8323f26ce3425460769605a6aece7a174edaa7d1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Jun 2012 13:36:05 +0200 Subject: sched: Fix race in task_group() Stefan reported a crash on a kernel before a3e5d1091c1 ("sched: Don't call task_group() too many times in set_task_rq()"), he found the reason to be that the multiple task_group() invocations in set_task_rq() returned different values. Looking at all that I found a lack of serialization and plain wrong comments. The below tries to fix it using an extra pointer which is updated under the appropriate scheduler locks. Its not pretty, but I can't really see another way given how all the cgroup stuff works. Reported-and-tested-by: Stefan Bader Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1340364965.18025.71.camel@twins Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 9 ++++++++- kernel/sched/sched.h | 23 ++++++++++------------- 2 files changed, 18 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 536b213f0ce5..5d011ef4c0df 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1096,7 +1096,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. * * sched_move_task() holds both and thus holding either pins the cgroup, - * see set_task_rq(). + * see task_group(). * * Furthermore, all task_rq users should acquire both locks, see * task_rq_lock(). @@ -7658,6 +7658,7 @@ void sched_destroy_group(struct task_group *tg) */ void sched_move_task(struct task_struct *tsk) { + struct task_group *tg; int on_rq, running; unsigned long flags; struct rq *rq; @@ -7672,6 +7673,12 @@ void sched_move_task(struct task_struct *tsk) if (unlikely(running)) tsk->sched_class->put_prev_task(rq, tsk); + tg = container_of(task_subsys_state_check(tsk, cpu_cgroup_subsys_id, + lockdep_is_held(&tsk->sighand->siglock)), + struct task_group, css); + tg = autogroup_task_group(tsk, tg); + tsk->sched_task_group = tg; + #ifdef CONFIG_FAIR_GROUP_SCHED if (tsk->sched_class->task_move_group) tsk->sched_class->task_move_group(tsk, on_rq); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 55844f24435a..c35a1a7dd4d6 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -538,22 +538,19 @@ extern int group_balance_cpu(struct sched_group *sg); /* * Return the group to which this tasks belongs. * - * We use task_subsys_state_check() and extend the RCU verification with - * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each - * task it moves into the cgroup. Therefore by holding either of those locks, - * we pin the task to the current cgroup. + * We cannot use task_subsys_state() and friends because the cgroup + * subsystem changes that value before the cgroup_subsys::attach() method + * is called, therefore we cannot pin it and might observe the wrong value. + * + * The same is true for autogroup's p->signal->autogroup->tg, the autogroup + * core changes this before calling sched_move_task(). + * + * Instead we use a 'copy' which is updated from sched_move_task() while + * holding both task_struct::pi_lock and rq::lock. */ static inline struct task_group *task_group(struct task_struct *p) { - struct task_group *tg; - struct cgroup_subsys_state *css; - - css = task_subsys_state_check(p, cpu_cgroup_subsys_id, - lockdep_is_held(&p->pi_lock) || - lockdep_is_held(&task_rq(p)->lock)); - tg = container_of(css, struct task_group, css); - - return autogroup_task_group(p, tg); + return p->sched_task_group; } /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ -- cgit v1.2.3 From 895dd92c032e1604aa0d7afaef7716e058343b67 Mon Sep 17 00:00:00 2001 From: Andrew Vagin Date: Thu, 12 Jul 2012 14:14:29 +0400 Subject: sched: Deliver sched_switch events to the current task Otherwise they can't be filtered for a defined task: perf record -e sched:sched_switch ./foo This command doesn't report any events without this patch. I think it isn't a security concern if someone knows who will be executed next - this can already be observed by polling /proc state. By default perf is disabled for non-root users in any case. I need these events for profiling sleep times. sched_switch is used for getting callchains and sched_stat_* is used for getting time periods. These events are combined in user space, then it can be analyzed by perf tools. Signed-off-by: Andrew Vagin Signed-off-by: Peter Zijlstra Cc: Steven Rostedt Cc: Arun Sharma Link: http://lkml.kernel.org/r/1342088069-1005148-1-git-send-email-avagin@openvz.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 468bdd44c1ba..ad732b56ba70 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1910,12 +1910,12 @@ static inline void prepare_task_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { + trace_sched_switch(prev, next); sched_info_switch(prev, next); perf_event_task_sched_out(prev, next); fire_sched_out_preempt_notifiers(prev, next); prepare_lock_switch(rq, next); prepare_arch_switch(next); - trace_sched_switch(prev, next); } /** -- cgit v1.2.3 From 8ded2bbc1845e19c771eb55209aab166ef011243 Mon Sep 17 00:00:00 2001 From: Josh Boyer Date: Wed, 25 Jul 2012 10:40:34 -0400 Subject: posix_types.h: Cleanup stale __NFDBITS and related definitions Recently, glibc made a change to suppress sign-conversion warnings in FD_SET (glibc commit ceb9e56b3d1). This uncovered an issue with the kernel's definition of __NFDBITS if applications #include after including . A build failure would be seen when passing the -Werror=sign-compare and -D_FORTIFY_SOURCE=2 flags to gcc. It was suggested that the kernel should either match the glibc definition of __NFDBITS or remove that entirely. The current in-kernel uses of __NFDBITS can be replaced with BITS_PER_LONG, and there are no uses of the related __FDELT and __FDMASK defines. Given that, we'll continue the cleanup that was started with commit 8b3d1cda4f5f ("posix_types: Remove fd_set macros") and drop the remaining unused macros. Additionally, linux/time.h has similar macros defined that expand to nothing so we'll remove those at the same time. Reported-by: Jeff Law Suggested-by: Linus Torvalds CC: Signed-off-by: Josh Boyer [ .. and fix up whitespace as per akpm ] Signed-off-by: Linus Torvalds --- kernel/exit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index d17f6c4ddfa9..f65345f9e5bb 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -483,7 +483,7 @@ static void close_files(struct files_struct * files) rcu_read_unlock(); for (;;) { unsigned long set; - i = j * __NFDBITS; + i = j * BITS_PER_LONG; if (i >= fdt->max_fds) break; set = fdt->open_fds[j++]; -- cgit v1.2.3 From f403072c6108e15f319a4a5036b650c77760522c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 29 Jul 2012 20:22:12 +0200 Subject: uprobes: Don't recheck vma/f_mapping in write_opcode() write_opcode() rechecks valid_vma() and ->f_mapping, this is pointless. The caller, register_for_each_vma() or uprobe_mmap(), has already done these checks under mmap_sem. To clarify, uprobe_mmap() checks valid_vma() only, but we can rely on build_probe_list(vm_file->f_mapping->host). Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Cc: Anton Arapov Cc: Srikar Dronamraju Link: http://lkml.kernel.org/r/20120729182212.GA20304@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index f93532748bca..a2b32a51d0a2 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -206,33 +206,16 @@ static int write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t opcode) { struct page *old_page, *new_page; - struct address_space *mapping; void *vaddr_old, *vaddr_new; struct vm_area_struct *vma; - struct uprobe *uprobe; int ret; + retry: /* Read the page with vaddr into memory */ ret = get_user_pages(NULL, mm, vaddr, 1, 0, 0, &old_page, &vma); if (ret <= 0) return ret; - ret = -EINVAL; - - /* - * We are interested in text pages only. Our pages of interest - * should be mapped for read and execute only. We desist from - * adding probes in write mapped pages since the breakpoints - * might end up in the file copy. - */ - if (!valid_vma(vma, is_swbp_insn(&opcode))) - goto put_out; - - uprobe = container_of(auprobe, struct uprobe, arch); - mapping = uprobe->inode->i_mapping; - if (mapping != vma->vm_file->f_mapping) - goto put_out; - ret = -ENOMEM; new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr); if (!new_page) -- cgit v1.2.3 From c517ee744b96e441d9c731e245f83c6d08dc0a19 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 29 Jul 2012 20:22:16 +0200 Subject: uprobes: __replace_page() should not use page_address_in_vma() page_address_in_vma(old_page) in __replace_page() is ugly and wrong. The caller already knows the correct virtual address, this page was found by get_user_pages(vaddr). However, page_address_in_vma() can actually fail if page->mapping was cleared by __delete_from_page_cache() after get_user_pages() returns. But this means the race with page reclaim, write_opcode() should not fail, it should retry and read this page again. Probably the race with remove_mapping() is not possible due to page_freeze_refs() logic, but afaics at least shmem_writepage()->shmem_delete_from_page_cache() can clear ->mapping. We could change __replace_page() to return -EAGAIN in this case, but it would be better to simply use the caller's vaddr and rely on page_check_address(). Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Cc: Anton Arapov Cc: Srikar Dronamraju Link: http://lkml.kernel.org/r/20120729182216.GA20311@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index a2b32a51d0a2..6fda7996892b 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -127,22 +127,19 @@ static loff_t vma_address(struct vm_area_struct *vma, loff_t offset) * based on replace_page in mm/ksm.c * * @vma: vma that holds the pte pointing to page + * @addr: address the old @page is mapped at * @page: the cowed page we are replacing by kpage * @kpage: the modified page we replace page by * * Returns 0 on success, -EFAULT on failure. */ -static int __replace_page(struct vm_area_struct *vma, struct page *page, struct page *kpage) +static int __replace_page(struct vm_area_struct *vma, unsigned long addr, + struct page *page, struct page *kpage) { struct mm_struct *mm = vma->vm_mm; - unsigned long addr; spinlock_t *ptl; pte_t *ptep; - addr = page_address_in_vma(page, vma); - if (addr == -EFAULT) - return -EFAULT; - ptep = page_check_address(page, mm, addr, &ptl, 0); if (!ptep) return -EAGAIN; @@ -243,7 +240,7 @@ retry: goto unlock_out; lock_page(new_page); - ret = __replace_page(vma, old_page, new_page); + ret = __replace_page(vma, vaddr, old_page, new_page); unlock_page(new_page); unlock_out: -- cgit v1.2.3 From 089ba999dc881a7549d97c55ac9e0052d061867d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 29 Jul 2012 20:22:18 +0200 Subject: uprobes: Kill write_opcode()->lock_page(new_page) write_opcode() does lock_page(new_page) for no reason. Nobody can see this page until __replace_page() exposes it under ptl lock, and we do nothing with this page after pte_unmap_unlock(). If nothing else, the similar code in do_wp_page() doesn't lock the new page for page_add_new_anon_rmap/set_pte_at_notify. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Cc: Anton Arapov Cc: Srikar Dronamraju Link: http://lkml.kernel.org/r/20120729182218.GA20315@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 6fda7996892b..23c562b7fc2e 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -239,9 +239,7 @@ retry: if (ret) goto unlock_out; - lock_page(new_page); ret = __replace_page(vma, vaddr, old_page, new_page); - unlock_page(new_page); unlock_out: unlock_page(old_page); -- cgit v1.2.3 From 9f92448ceeea5326db7d114005a7e7ac03904edf Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 29 Jul 2012 20:22:20 +0200 Subject: uprobes: Clean up and document write_opcode()->lock_page(old_page) The comment above write_opcode()->lock_page(old_page) tells about the race with do_wp_page(). I don't really understand which exactly race it means, but afaics this lock_page() was not enough to close all races with do_wp_page(). Anyway, since: 77fc4af1b59d uprobes: Change register_for_each_vma() to take mm->mmap_sem for writing this code is always called with ->mmap_sem held for writing, so we can forget about do_wp_page(). However, we can't simply remove this lock_page(), and the only (afaics) reason is __replace_page()->try_to_free_swap(). Nothing in write_opcode() needs it, move it into __replace_page() and fix the comment. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Cc: Anton Arapov Cc: Srikar Dronamraju Link: http://lkml.kernel.org/r/20120729182220.GA20322@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 23c562b7fc2e..5db150b306d2 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -139,10 +139,15 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, struct mm_struct *mm = vma->vm_mm; spinlock_t *ptl; pte_t *ptep; + int err; + /* freeze PageSwapCache() for try_to_free_swap() below */ + lock_page(page); + + err = -EAGAIN; ptep = page_check_address(page, mm, addr, &ptl, 0); if (!ptep) - return -EAGAIN; + goto unlock; get_page(kpage); page_add_new_anon_rmap(kpage, vma, addr); @@ -162,7 +167,10 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, put_page(page); pte_unmap_unlock(ptep, ptl); - return 0; + err = 0; + unlock: + unlock_page(page); + return err; } /** @@ -216,15 +224,10 @@ retry: ret = -ENOMEM; new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr); if (!new_page) - goto put_out; + goto put_old; __SetPageUptodate(new_page); - /* - * lock page will serialize against do_wp_page()'s - * PageAnon() handling - */ - lock_page(old_page); /* copy the page now that we've got it stable */ vaddr_old = kmap_atomic(old_page); vaddr_new = kmap_atomic(new_page); @@ -237,15 +240,13 @@ retry: ret = anon_vma_prepare(vma); if (ret) - goto unlock_out; + goto put_new; ret = __replace_page(vma, vaddr, old_page, new_page); -unlock_out: - unlock_page(old_page); +put_new: page_cache_release(new_page); - -put_out: +put_old: put_page(old_page); if (unlikely(ret == -EAGAIN)) -- cgit v1.2.3 From 665605a2a207dbe1fa429b474f932d6ea138ba92 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 29 Jul 2012 20:22:29 +0200 Subject: uprobes: Uprobe_mmap/munmap needs list_for_each_entry_safe() The bug was introduced by me in 449d0d7c ("uprobes: Simplify the usage of uprobe->pending_list"). Yes, we do not care about uprobe->pending_list after return and nobody can remove the current list entry, but put_uprobe(uprobe) can actually free it and thus we need list_for_each_safe(). Reported-by: Srikar Dronamraju Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Cc: Anton Arapov Link: http://lkml.kernel.org/r/20120729182229.GA20329@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 5db150b306d2..bed2161620d7 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1010,7 +1010,7 @@ static void build_probe_list(struct inode *inode, struct list_head *head) int uprobe_mmap(struct vm_area_struct *vma) { struct list_head tmp_list; - struct uprobe *uprobe; + struct uprobe *uprobe, *u; struct inode *inode; int ret, count; @@ -1028,7 +1028,7 @@ int uprobe_mmap(struct vm_area_struct *vma) ret = 0; count = 0; - list_for_each_entry(uprobe, &tmp_list, pending_list) { + list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { if (!ret) { loff_t vaddr = vma_address(vma, uprobe->offset); @@ -1076,7 +1076,7 @@ int uprobe_mmap(struct vm_area_struct *vma) void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end) { struct list_head tmp_list; - struct uprobe *uprobe; + struct uprobe *uprobe, *u; struct inode *inode; if (!atomic_read(&uprobe_events) || !valid_vma(vma, false)) @@ -1093,7 +1093,7 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon mutex_lock(uprobes_mmap_hash(inode)); build_probe_list(inode, &tmp_list); - list_for_each_entry(uprobe, &tmp_list, pending_list) { + list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { loff_t vaddr = vma_address(vma, uprobe->offset); if (vaddr >= start && vaddr < end) { -- cgit v1.2.3 From 2fd611a991391a6050cbd139201a2e12fc306540 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 29 Jul 2012 20:22:31 +0200 Subject: uprobes: Suppress uprobe_munmap() from mmput() uprobe_munmap() does get_user_pages() and it is also called from the final mmput()->exit_mmap() path. This slows down exit/mmput() for no reason, and I think it is simply dangerous/wrong to try to fault-in a page into the dying mm. If nothing else, this happens after the last sync_mm_rss(), afaics handle_mm_fault() can change the task->rss_stat and make the subsequent check_mm() unhappy. Change uprobe_munmap() to check mm->mm_users != 0. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Cc: Anton Arapov Cc: Srikar Dronamraju Link: http://lkml.kernel.org/r/20120729182231.GA20336@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index bed2161620d7..9db9cdf8ff34 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1082,6 +1082,9 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon if (!atomic_read(&uprobe_events) || !valid_vma(vma, false)) return; + if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */ + return; + if (!atomic_read(&vma->vm_mm->uprobes_state.count)) return; -- cgit v1.2.3 From aefd8933d445abf7ff0d4027c624737898827bcd Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 29 Jul 2012 20:22:33 +0200 Subject: uprobes: Fix overflow in vma_address()/find_active_uprobe() vma->vm_pgoff is "unsigned long", it should be promoted to loff_t before the multiplication to avoid the overflow. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Cc: Anton Arapov Cc: Srikar Dronamraju Link: http://lkml.kernel.org/r/20120729182233.GA20339@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 9db9cdf8ff34..fb961d5e205c 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -117,7 +117,7 @@ static loff_t vma_address(struct vm_area_struct *vma, loff_t offset) loff_t vaddr; vaddr = vma->vm_start + offset; - vaddr -= vma->vm_pgoff << PAGE_SHIFT; + vaddr -= (loff_t)vma->vm_pgoff << PAGE_SHIFT; return vaddr; } @@ -1450,7 +1450,7 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) inode = vma->vm_file->f_mapping->host; offset = bp_vaddr - vma->vm_start; - offset += (vma->vm_pgoff << PAGE_SHIFT); + offset += (loff_t)vma->vm_pgoff << PAGE_SHIFT; uprobe = find_uprobe(inode, offset); } -- cgit v1.2.3 From 891c39708144bbe401b4aa151bffd0fe41b1dafd Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 29 Jul 2012 20:22:40 +0200 Subject: uprobes: Teach build_probe_list() to consider the range Currently build_probe_list() builds the list of all uprobes attached to the given inode, and the caller should filter out those who don't fall into the [start,end) range, this is sub-optimal. This patch turns find_least_offset_node() into find_node_in_range() which returns the first node inside the [min,max] range, and changes build_probe_list() to use this node as a starting point for rb_prev() and rb_next() to find all other nodes the caller needs. The resulting list is no longer sorted but we do not care. This can speed up both build_probe_list() and the callers, but there is another reason to introduce find_node_in_range(). It can be used to figure out whether the given vma has uprobes or not, this will be needed soon. While at it, shift INIT_LIST_HEAD(tmp_list) into build_probe_list(). Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Cc: Anton Arapov Cc: Srikar Dronamraju Link: http://lkml.kernel.org/r/20120729182240.GA20352@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 103 +++++++++++++++++++++++------------------------- 1 file changed, 50 insertions(+), 53 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index fb961d5e205c..2ef123306183 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -939,59 +939,66 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume put_uprobe(uprobe); } -/* - * Of all the nodes that correspond to the given inode, return the node - * with the least offset. - */ -static struct rb_node *find_least_offset_node(struct inode *inode) +static struct rb_node * +find_node_in_range(struct inode *inode, loff_t min, loff_t max) { - struct uprobe u = { .inode = inode, .offset = 0}; struct rb_node *n = uprobes_tree.rb_node; - struct rb_node *close_node = NULL; - struct uprobe *uprobe; - int match; while (n) { - uprobe = rb_entry(n, struct uprobe, rb_node); - match = match_uprobe(&u, uprobe); - - if (uprobe->inode == inode) - close_node = n; + struct uprobe *u = rb_entry(n, struct uprobe, rb_node); - if (!match) - return close_node; - - if (match < 0) + if (inode < u->inode) { n = n->rb_left; - else + } else if (inode > u->inode) { n = n->rb_right; + } else { + if (max < u->offset) + n = n->rb_left; + else if (min > u->offset) + n = n->rb_right; + else + break; + } } - return close_node; + return n; } /* - * For a given inode, build a list of probes that need to be inserted. + * For a given range in vma, build a list of probes that need to be inserted. */ -static void build_probe_list(struct inode *inode, struct list_head *head) +static void build_probe_list(struct inode *inode, + struct vm_area_struct *vma, + unsigned long start, unsigned long end, + struct list_head *head) { - struct uprobe *uprobe; + loff_t min, max; unsigned long flags; - struct rb_node *n; - - spin_lock_irqsave(&uprobes_treelock, flags); - - n = find_least_offset_node(inode); + struct rb_node *n, *t; + struct uprobe *u; - for (; n; n = rb_next(n)) { - uprobe = rb_entry(n, struct uprobe, rb_node); - if (uprobe->inode != inode) - break; + INIT_LIST_HEAD(head); + min = ((loff_t)vma->vm_pgoff << PAGE_SHIFT) + start - vma->vm_start; + max = min + (end - start) - 1; - list_add(&uprobe->pending_list, head); - atomic_inc(&uprobe->ref); + spin_lock_irqsave(&uprobes_treelock, flags); + n = find_node_in_range(inode, min, max); + if (n) { + for (t = n; t; t = rb_prev(t)) { + u = rb_entry(t, struct uprobe, rb_node); + if (u->inode != inode || u->offset < min) + break; + list_add(&u->pending_list, head); + atomic_inc(&u->ref); + } + for (t = n; (t = rb_next(t)); ) { + u = rb_entry(t, struct uprobe, rb_node); + if (u->inode != inode || u->offset > max) + break; + list_add(&u->pending_list, head); + atomic_inc(&u->ref); + } } - spin_unlock_irqrestore(&uprobes_treelock, flags); } @@ -1021,9 +1028,8 @@ int uprobe_mmap(struct vm_area_struct *vma) if (!inode) return 0; - INIT_LIST_HEAD(&tmp_list); mutex_lock(uprobes_mmap_hash(inode)); - build_probe_list(inode, &tmp_list); + build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list); ret = 0; count = 0; @@ -1032,11 +1038,6 @@ int uprobe_mmap(struct vm_area_struct *vma) if (!ret) { loff_t vaddr = vma_address(vma, uprobe->offset); - if (vaddr < vma->vm_start || vaddr >= vma->vm_end) { - put_uprobe(uprobe); - continue; - } - ret = install_breakpoint(uprobe, vma->vm_mm, vma, vaddr); /* * We can race against uprobe_register(), see the @@ -1092,21 +1093,17 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon if (!inode) return; - INIT_LIST_HEAD(&tmp_list); mutex_lock(uprobes_mmap_hash(inode)); - build_probe_list(inode, &tmp_list); + build_probe_list(inode, vma, start, end, &tmp_list); list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { loff_t vaddr = vma_address(vma, uprobe->offset); - - if (vaddr >= start && vaddr < end) { - /* - * An unregister could have removed the probe before - * unmap. So check before we decrement the count. - */ - if (is_swbp_at_addr(vma->vm_mm, vaddr) == 1) - atomic_dec(&vma->vm_mm->uprobes_state.count); - } + /* + * An unregister could have removed the probe before + * unmap. So check before we decrement the count. + */ + if (is_swbp_at_addr(vma->vm_mm, vaddr) == 1) + atomic_dec(&vma->vm_mm->uprobes_state.count); put_uprobe(uprobe); } mutex_unlock(uprobes_mmap_hash(inode)); -- cgit v1.2.3 From cb113b47d098185f3f1f67e8300d05ddce842b66 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 29 Jul 2012 20:22:42 +0200 Subject: uprobes: Introduce vaddr_to_offset(vma, vaddr) Add the new helper, vaddr_to_offset(vma, vaddr) which returns the offset in vma->vm_file this vaddr is mapped at. Change build_probe_list() and find_active_uprobe() to use the new helper, the next patch adds another user. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Cc: Anton Arapov Cc: Srikar Dronamraju Link: http://lkml.kernel.org/r/20120729182242.GA20355@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 2ef123306183..b03256cced52 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -122,6 +122,11 @@ static loff_t vma_address(struct vm_area_struct *vma, loff_t offset) return vaddr; } +static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr) +{ + return ((loff_t)vma->vm_pgoff << PAGE_SHIFT) + (vaddr - vma->vm_start); +} + /** * __replace_page - replace page in vma by new page. * based on replace_page in mm/ksm.c @@ -978,7 +983,7 @@ static void build_probe_list(struct inode *inode, struct uprobe *u; INIT_LIST_HEAD(head); - min = ((loff_t)vma->vm_pgoff << PAGE_SHIFT) + start - vma->vm_start; + min = vaddr_to_offset(vma, start); max = min + (end - start) - 1; spin_lock_irqsave(&uprobes_treelock, flags); @@ -1442,12 +1447,9 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) vma = find_vma(mm, bp_vaddr); if (vma && vma->vm_start <= bp_vaddr) { if (valid_vma(vma, false)) { - struct inode *inode; - loff_t offset; + struct inode *inode = vma->vm_file->f_mapping->host; + loff_t offset = vaddr_to_offset(vma, bp_vaddr); - inode = vma->vm_file->f_mapping->host; - offset = bp_vaddr - vma->vm_start; - offset += (loff_t)vma->vm_pgoff << PAGE_SHIFT; uprobe = find_uprobe(inode, offset); } -- cgit v1.2.3 From f4d6dfe55115efe981b4b5f37183ddccaaa792f0 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 29 Jul 2012 20:22:44 +0200 Subject: uprobes: Fix register_for_each_vma()->vma_address() check 1. register_for_each_vma() checks that vma_address() == vaddr, but this is not enough. We should also ensure that vaddr >= vm_start, find_vma() guarantees "vaddr < vm_end" only. 2. After the prevous changes, register_for_each_vma() is the only reason why vma_address() has to return loff_t, all other users know that we have the valid mapping at this offset and thus the overflow is not possible. Change the code to use vaddr_to_offset() instead, imho this looks more clean/understandable and now we can change vma_address(). 3. While at it, remove the unnecessary type-cast. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Cc: Anton Arapov Cc: Srikar Dronamraju Link: http://lkml.kernel.org/r/20120729182244.GA20362@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index b03256cced52..cdc3c951251c 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -823,12 +823,13 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register) goto free; down_write(&mm->mmap_sem); - vma = find_vma(mm, (unsigned long)info->vaddr); - if (!vma || !valid_vma(vma, is_register)) + vma = find_vma(mm, info->vaddr); + if (!vma || !valid_vma(vma, is_register) || + vma->vm_file->f_mapping->host != uprobe->inode) goto unlock; - if (vma->vm_file->f_mapping->host != uprobe->inode || - vma_address(vma, uprobe->offset) != info->vaddr) + if (vma->vm_start > info->vaddr || + vaddr_to_offset(vma, info->vaddr) != uprobe->offset) goto unlock; if (is_register) { -- cgit v1.2.3 From 57683f72b8c01c53c85fe13e82fe1feb3a06bcd5 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 29 Jul 2012 20:22:47 +0200 Subject: uprobes: Rename vma_address() and make it return "unsigned long" 1. vma_address() returns loff_t, this looks confusing and this is unnecessary after the previous change. Make it return "ulong", all callers truncate the result anyway. 2. Its name conflicts with mm/rmap.c:vma_address(), rename it to offset_to_vaddr(), this matches vaddr_to_offset(). Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Cc: Anton Arapov Cc: Srikar Dronamraju Link: http://lkml.kernel.org/r/20120729182247.GA20365@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index cdc3c951251c..bb30a4fc5050 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -112,14 +112,9 @@ static bool valid_vma(struct vm_area_struct *vma, bool is_register) return false; } -static loff_t vma_address(struct vm_area_struct *vma, loff_t offset) +static unsigned long offset_to_vaddr(struct vm_area_struct *vma, loff_t offset) { - loff_t vaddr; - - vaddr = vma->vm_start + offset; - vaddr -= (loff_t)vma->vm_pgoff << PAGE_SHIFT; - - return vaddr; + return vma->vm_start + offset - ((loff_t)vma->vm_pgoff << PAGE_SHIFT); } static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr) @@ -775,7 +770,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register) curr = info; info->mm = vma->vm_mm; - info->vaddr = vma_address(vma, offset); + info->vaddr = offset_to_vaddr(vma, offset); } mutex_unlock(&mapping->i_mmap_mutex); @@ -1042,7 +1037,7 @@ int uprobe_mmap(struct vm_area_struct *vma) list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { if (!ret) { - loff_t vaddr = vma_address(vma, uprobe->offset); + unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset); ret = install_breakpoint(uprobe, vma->vm_mm, vma, vaddr); /* @@ -1103,7 +1098,7 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon build_probe_list(inode, vma, start, end, &tmp_list); list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { - loff_t vaddr = vma_address(vma, uprobe->offset); + unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset); /* * An unregister could have removed the probe before * unmap. So check before we decrement the count. -- cgit v1.2.3 From 194f8dcbe9629d8e9346cf96345a9c0bbf0e67ae Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 29 Jul 2012 20:22:49 +0200 Subject: uprobes: __replace_page() needs munlock_vma_page() Like do_wp_page(), __replace_page() should do munlock_vma_page() for the case when the old page still has other !VM_LOCKED mappings. Unfortunately this needs mm/internal.h. Also, move put_page() outside of ptl lock. This doesn't really matter but looks a bit better. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju Cc: Anton Arapov Cc: Srikar Dronamraju Link: http://lkml.kernel.org/r/20120729182249.GA20372@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index bb30a4fc5050..c08a22d02f72 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -32,6 +32,7 @@ #include /* try_to_free_swap */ #include /* user_enable_single_step */ #include /* notifier mechanism */ +#include "../../mm/internal.h" /* munlock_vma_page */ #include @@ -141,7 +142,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep; int err; - /* freeze PageSwapCache() for try_to_free_swap() below */ + /* For try_to_free_swap() and munlock_vma_page() below */ lock_page(page); err = -EAGAIN; @@ -164,9 +165,12 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, page_remove_rmap(page); if (!page_mapped(page)) try_to_free_swap(page); - put_page(page); pte_unmap_unlock(ptep, ptl); + if (vma->vm_flags & VM_LOCKED) + munlock_vma_page(page); + put_page(page); + err = 0; unlock: unlock_page(page); -- cgit v1.2.3 From f1fd75bfa07822b1de314062baff3280419a8bf4 Mon Sep 17 00:00:00 2001 From: Sasikantha babu Date: Mon, 30 Jul 2012 14:39:08 -0700 Subject: prctl: remove redunant assignment of "error" to zero Just setting the "error" to error number is enough on failure and It doesn't require to set "error" variable to zero in each switch case, since it was already initialized with zero. And also removed return 0 in switch case with break statement Signed-off-by: Sasikantha babu Acked-by: Kees Cook Acked-by: Serge E. Hallyn Cc: Cyrill Gorcunov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 2d39a84cd857..b04ae0390df3 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2015,7 +2015,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, break; } me->pdeath_signal = arg2; - error = 0; break; case PR_GET_PDEATHSIG: error = put_user(me->pdeath_signal, (int __user *)arg2); @@ -2029,7 +2028,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, break; } set_dumpable(me->mm, arg2); - error = 0; break; case PR_SET_UNALIGN: @@ -2056,10 +2054,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_SET_TIMING: if (arg2 != PR_TIMING_STATISTICAL) error = -EINVAL; - else - error = 0; break; - case PR_SET_NAME: comm[sizeof(me->comm)-1] = 0; if (strncpy_from_user(comm, (char __user *)arg2, @@ -2067,20 +2062,19 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, return -EFAULT; set_task_comm(me, comm); proc_comm_connector(me); - return 0; + break; case PR_GET_NAME: get_task_comm(comm, me); if (copy_to_user((char __user *)arg2, comm, sizeof(comm))) return -EFAULT; - return 0; + break; case PR_GET_ENDIAN: error = GET_ENDIAN(me, arg2); break; case PR_SET_ENDIAN: error = SET_ENDIAN(me, arg2); break; - case PR_GET_SECCOMP: error = prctl_get_seccomp(); break; @@ -2108,7 +2102,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, current->default_timer_slack_ns; else current->timer_slack_ns = arg2; - error = 0; break; case PR_MCE_KILL: if (arg4 | arg5) @@ -2134,7 +2127,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, default: return -EINVAL; } - error = 0; break; case PR_MCE_KILL_GET: if (arg2 | arg3 | arg4 | arg5) @@ -2153,7 +2145,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, break; case PR_SET_CHILD_SUBREAPER: me->signal->is_child_subreaper = !!arg2; - error = 0; break; case PR_GET_CHILD_SUBREAPER: error = put_user(me->signal->is_child_subreaper, -- cgit v1.2.3 From 54b501992dd2a839e94e76aa392c392b55080ce8 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 30 Jul 2012 14:39:18 -0700 Subject: coredump: warn about unsafe suid_dumpable / core_pattern combo When suid_dumpable=2, detect unsafe core_pattern settings and warn when they are seen. Signed-off-by: Kees Cook Suggested-by: Andrew Morton Cc: Alexander Viro Cc: Alan Cox Cc: "Eric W. Biederman" Cc: Doug Ledford Cc: Serge Hallyn Cc: James Morris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 4ab11879aeb4..b46f496405e4 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -174,6 +174,11 @@ static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); #endif +static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); +static int proc_dostring_coredump(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); + #ifdef CONFIG_MAGIC_SYSRQ /* Note: sysrq code uses it's own private copy */ static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE; @@ -410,7 +415,7 @@ static struct ctl_table kern_table[] = { .data = core_pattern, .maxlen = CORENAME_MAX_SIZE, .mode = 0644, - .proc_handler = proc_dostring, + .proc_handler = proc_dostring_coredump, }, { .procname = "core_pipe_limit", @@ -1498,7 +1503,7 @@ static struct ctl_table fs_table[] = { .data = &suid_dumpable, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dointvec_minmax_coredump, .extra1 = &zero, .extra2 = &two, }, @@ -2009,6 +2014,34 @@ int proc_dointvec_minmax(struct ctl_table *table, int write, do_proc_dointvec_minmax_conv, ¶m); } +static void validate_coredump_safety(void) +{ + if (suid_dumpable == SUID_DUMPABLE_SAFE && + core_pattern[0] != '/' && core_pattern[0] != '|') { + printk(KERN_WARNING "Unsafe core_pattern used with "\ + "suid_dumpable=2. Pipe handler or fully qualified "\ + "core dump path required.\n"); + } +} + +static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (!error) + validate_coredump_safety(); + return error; +} + +static int proc_dostring_coredump(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int error = proc_dostring(table, write, buffer, lenp, ppos); + if (!error) + validate_coredump_safety(); + return error; +} + static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos, -- cgit v1.2.3 From 190320c3b6640d4104650f55ff69611e050ea06b Mon Sep 17 00:00:00 2001 From: Vikram Mulukutla Date: Mon, 30 Jul 2012 14:39:58 -0700 Subject: panic: fix a possible deadlock in panic() panic_lock is meant to ensure that panic processing takes place only on one cpu; if any of the other cpus encounter a panic, they will spin waiting to be shut down. However, this causes a regression in this scenario: 1. Cpu 0 encounters a panic and acquires the panic_lock and proceeds with the panic processing. 2. There is an interrupt on cpu 0 that also encounters an error condition and invokes panic. 3. This second invocation fails to acquire the panic_lock and enters the infinite while loop in panic_smp_self_stop. Thus all panic processing is stopped, and the cpu is stuck for eternity in the while(1) inside panic_smp_self_stop. To address this, disable local interrupts with local_irq_disable before acquiring the panic_lock. This will prevent interrupt handlers from executing during the panic processing, thus avoiding this particular problem. Signed-off-by: Vikram Mulukutla Reviewed-by: Stephen Boyd Cc: Michael Holzheu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index d2a5f4ecc6dd..e1b2822fff97 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -74,6 +74,14 @@ void panic(const char *fmt, ...) long i, i_next = 0; int state = 0; + /* + * Disable local interrupts. This will prevent panic_smp_self_stop + * from deadlocking the first cpu that invokes the panic, since + * there is nothing to prevent an interrupt handler (that runs + * after the panic_lock is acquired) from invoking panic again. + */ + local_irq_disable(); + /* * It's possible to come here directly from a panic-assertion and * not have preempt disabled. Some functions called from here want -- cgit v1.2.3 From 45226e944ce071d0231949f2fea90969437cd2dc Mon Sep 17 00:00:00 2001 From: Sameer Nanda Date: Mon, 30 Jul 2012 14:40:00 -0700 Subject: NMI watchdog: fix for lockup detector breakage on resume On the suspend/resume path the boot CPU does not go though an offline->online transition. This breaks the NMI detector post-resume since it depends on PMU state that is lost when the system gets suspended. Fix this by forcing a CPU offline->online transition for the lockup detector on the boot CPU during resume. To provide more context, we enable NMI watchdog on Chrome OS. We have seen several reports of systems freezing up completely which indicated that the NMI watchdog was not firing for some reason. Debugging further, we found a simple way of repro'ing system freezes -- issuing the command 'tasket 1 sh -c "echo nmilockup > /proc/breakme"' after the system has been suspended/resumed one or more times. With this patch in place, the system freeze result in panics, as expected. These panics provide a nice stack trace for us to debug the actual issue causing the freeze. [akpm@linux-foundation.org: fiddle with code comment] [akpm@linux-foundation.org: make lockup_detector_bootcpu_resume() conditional on CONFIG_SUSPEND] [akpm@linux-foundation.org: fix section errors] Signed-off-by: Sameer Nanda Cc: Ingo Molnar Cc: Peter Zijlstra Cc: "Rafael J. Wysocki" Cc: Don Zickus Cc: Mandeep Singh Baines Cc: Srivatsa S. Bhat Cc: Anshuman Khandual Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/suspend.c | 3 +++ kernel/watchdog.c | 21 +++++++++++++++++++-- 2 files changed, 22 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index c8b7446b27df..1da39ea248fd 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -178,6 +178,9 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) arch_suspend_enable_irqs(); BUG_ON(irqs_disabled()); + /* Kick the lockup detector */ + lockup_detector_bootcpu_resume(); + Enable_cpus: enable_nonboot_cpus(); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 4b1dfba70f7c..69add8a9da68 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -575,7 +575,7 @@ out: /* * Create/destroy watchdog threads as CPUs come and go: */ -static int __cpuinit +static int cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { int hotcpu = (unsigned long)hcpu; @@ -610,10 +610,27 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) return NOTIFY_OK; } -static struct notifier_block __cpuinitdata cpu_nfb = { +static struct notifier_block cpu_nfb = { .notifier_call = cpu_callback }; +#ifdef CONFIG_SUSPEND +/* + * On exit from suspend we force an offline->online transition on the boot CPU + * so that the PMU state that was lost while in suspended state gets set up + * properly for the boot CPU. This information is required for restarting the + * NMI watchdog. + */ +void lockup_detector_bootcpu_resume(void) +{ + void *cpu = (void *)(long)smp_processor_id(); + + cpu_callback(&cpu_nfb, CPU_DEAD_FROZEN, cpu); + cpu_callback(&cpu_nfb, CPU_UP_PREPARE_FROZEN, cpu); + cpu_callback(&cpu_nfb, CPU_ONLINE_FROZEN, cpu); +} +#endif + void __init lockup_detector_init(void) { void *cpu = (void *)(long)smp_processor_id(); -- cgit v1.2.3 From b57b44ae698944ffc6161352b8ff5c9cf9c592e2 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 30 Jul 2012 14:40:03 -0700 Subject: kernel/sys.c: avoid argv_free(NULL) If argv_split() failed, the code will end up calling argv_free(NULL). Fix it up and clean things up a bit. Addresses Coverity report 703573. Cc: Cyrill Gorcunov Cc: Kees Cook Cc: Serge Hallyn Cc: "Eric W. Biederman" Cc: WANG Cong Cc: Alan Cox Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 44 +++++++++++++++++++++++++------------------- 1 file changed, 25 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index b04ae0390df3..241507f23eca 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2186,46 +2186,52 @@ static void argv_cleanup(struct subprocess_info *info) argv_free(info->argv); } -/** - * orderly_poweroff - Trigger an orderly system poweroff - * @force: force poweroff if command execution fails - * - * This may be called from any context to trigger a system shutdown. - * If the orderly shutdown fails, it will force an immediate shutdown. - */ -int orderly_poweroff(bool force) +static int __orderly_poweroff(void) { int argc; - char **argv = argv_split(GFP_ATOMIC, poweroff_cmd, &argc); + char **argv; static char *envp[] = { "HOME=/", "PATH=/sbin:/bin:/usr/sbin:/usr/bin", NULL }; - int ret = -ENOMEM; + int ret; + argv = argv_split(GFP_ATOMIC, poweroff_cmd, &argc); if (argv == NULL) { printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n", __func__, poweroff_cmd); - goto out; + return -ENOMEM; } ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_NO_WAIT, NULL, argv_cleanup, NULL); -out: - if (likely(!ret)) - return 0; - if (ret == -ENOMEM) argv_free(argv); - if (force) { + return ret; +} + +/** + * orderly_poweroff - Trigger an orderly system poweroff + * @force: force poweroff if command execution fails + * + * This may be called from any context to trigger a system shutdown. + * If the orderly shutdown fails, it will force an immediate shutdown. + */ +int orderly_poweroff(bool force) +{ + int ret = __orderly_poweroff(); + + if (ret && force) { printk(KERN_WARNING "Failed to start orderly shutdown: " "forcing the issue\n"); - /* I guess this should try to kick off some daemon to - sync and poweroff asap. Or not even bother syncing - if we're doing an emergency shutdown? */ + /* + * I guess this should try to kick off some daemon to sync and + * poweroff asap. Or not even bother syncing if we're doing an + * emergency shutdown? + */ emergency_sync(); kernel_power_off(); } -- cgit v1.2.3 From cdf53441368cc02ee4aa8a8343a5dc25132836f0 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Mon, 30 Jul 2012 14:40:08 -0700 Subject: kmsg: /dev/kmsg - properly return possible copy_from_user() failure Reported-by: Andrew Morton Signed-off-by: Kay Sievers Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 50c96b5651b6..852269adad25 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -389,8 +389,10 @@ static ssize_t devkmsg_writev(struct kiocb *iocb, const struct iovec *iv, line = buf; for (i = 0; i < count; i++) { - if (copy_from_user(line, iv[i].iov_base, iv[i].iov_len)) + if (copy_from_user(line, iv[i].iov_base, iv[i].iov_len)) { + ret = -EFAULT; goto out; + } line += iv[i].iov_len; } -- cgit v1.2.3 From acc8fa41ad31c576cdbc569cc3e0e443b1b98b44 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 30 Jul 2012 14:40:09 -0700 Subject: printk: add generic functions to find KERN_ headers The current form of a KERN_ is "<.>". Add printk_get_level and printk_skip_level functions to handle these formats. These functions centralize tests of KERN_ so a future modification can change the KERN_ style and shorten the number of bytes consumed by these headers. [akpm@linux-foundation.org: fix build error and warning] Signed-off-by: Joe Perches Cc: Kay Sievers Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 852269adad25..0d882a2f231e 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1487,6 +1487,7 @@ asmlinkage int vprintk_emit(int facility, int level, size_t text_len; enum log_flags lflags = 0; unsigned long flags; + int kern_level; int this_cpu; int printed_len = 0; @@ -1543,17 +1544,20 @@ asmlinkage int vprintk_emit(int facility, int level, } /* strip syslog prefix and extract log level or control flags */ - if (text[0] == '<' && text[1] && text[2] == '>') { - switch (text[1]) { + kern_level = printk_get_level(text); + if (kern_level) { + const char *end_of_header = printk_skip_level(text); + switch (kern_level) { case '0' ... '7': if (level == -1) - level = text[1] - '0'; + level = kern_level - '0'; case 'd': /* KERN_DEFAULT */ lflags |= LOG_PREFIX; case 'c': /* KERN_CONT */ - text += 3; - text_len -= 3; + break; } + text_len -= end_of_header - text; + text = (char *)end_of_header; } if (level == -1) -- cgit v1.2.3 From 088a52aac810655c1db1e40331e4936946701e9c Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 30 Jul 2012 14:40:19 -0700 Subject: printk: only look for prefix levels in kernel messages vprintk_emit() prefix parsing should only be done for internal kernel messages. This allows existing behavior to be kept in all cases. Signed-off-by: Joe Perches Cc: Kay Sievers Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 0d882a2f231e..6a76ab9d4476 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1487,7 +1487,6 @@ asmlinkage int vprintk_emit(int facility, int level, size_t text_len; enum log_flags lflags = 0; unsigned long flags; - int kern_level; int this_cpu; int printed_len = 0; @@ -1543,21 +1542,24 @@ asmlinkage int vprintk_emit(int facility, int level, lflags |= LOG_NEWLINE; } - /* strip syslog prefix and extract log level or control flags */ - kern_level = printk_get_level(text); - if (kern_level) { - const char *end_of_header = printk_skip_level(text); - switch (kern_level) { - case '0' ... '7': - if (level == -1) - level = kern_level - '0'; - case 'd': /* KERN_DEFAULT */ - lflags |= LOG_PREFIX; - case 'c': /* KERN_CONT */ - break; + /* strip kernel syslog prefix and extract log level or control flags */ + if (facility == 0) { + int kern_level = printk_get_level(text); + + if (kern_level) { + const char *end_of_header = printk_skip_level(text); + switch (kern_level) { + case '0' ... '7': + if (level == -1) + level = kern_level - '0'; + case 'd': /* KERN_DEFAULT */ + lflags |= LOG_PREFIX; + case 'c': /* KERN_CONT */ + break; + } + text_len -= end_of_header - text; + text = (char *)end_of_header; } - text_len -= end_of_header - text; - text = (char *)end_of_header; } if (level == -1) -- cgit v1.2.3 From 79c743dd1e8de61c31f484c0a1b48930543044b3 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 30 Jul 2012 14:42:17 -0700 Subject: kernel/kmod.c: document call_usermodehelper_fns() a bit This function's interface is, uh, subtle. Attempt to apologise for it. Cc: WANG Cong Cc: Cyrill Gorcunov Cc: Kees Cook Cc: Serge Hallyn Cc: "Eric W. Biederman" Cc: Alan Cox Cc: Oleg Nesterov Cc: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kmod.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index ff2c7cb86d77..2a8351516a0e 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -577,6 +577,12 @@ unlock: return retval; } +/* + * call_usermodehelper_fns() will not run the caller-provided cleanup function + * if a memory allocation failure is experienced. So the caller might need to + * check the call_usermodehelper_fns() return value: if it is -ENOMEM, perform + * the necessaary cleanup within the caller. + */ int call_usermodehelper_fns( char *path, char **argv, char **envp, int wait, int (*init)(struct subprocess_info *info, struct cred *new), -- cgit v1.2.3 From 0f20784d4ba3f88ca33b703b23372d8ccf6dbd42 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Mon, 30 Jul 2012 14:42:20 -0700 Subject: kmod: avoid deadlock from recursive kmod call The system deadlocks (at least since 2.6.10) when call_usermodehelper(UMH_WAIT_EXEC) request triggers call_usermodehelper(UMH_WAIT_PROC) request. This is because "khelper thread is waiting for the worker thread at wait_for_completion() in do_fork() since the worker thread was created with CLONE_VFORK flag" and "the worker thread cannot call complete() because do_execve() is blocked at UMH_WAIT_PROC request" and "the khelper thread cannot start processing UMH_WAIT_PROC request because the khelper thread is waiting for the worker thread at wait_for_completion() in do_fork()". The easiest example to observe this deadlock is to use a corrupted /sbin/hotplug binary (like shown below). # : > /tmp/dummy # chmod 755 /tmp/dummy # echo /tmp/dummy > /proc/sys/kernel/hotplug # modprobe whatever call_usermodehelper("/tmp/dummy", UMH_WAIT_EXEC) is called from kobject_uevent_env() in lib/kobject_uevent.c upon loading/unloading a module. do_execve("/tmp/dummy") triggers a call to request_module("binfmt-0000") from search_binary_handler() which in turn calls call_usermodehelper(UMH_WAIT_PROC). In order to avoid deadlock, as a for-now and easy-to-backport solution, do not try to call wait_for_completion() in call_usermodehelper_exec() if the worker thread was created by khelper thread with CLONE_VFORK flag. Future and fundamental solution might be replacing singleton khelper thread with some workqueue so that recursive calls up to max_active dependency loop can be handled without deadlock. [akpm@linux-foundation.org: add comment to kmod_thread_locker] Signed-off-by: Tetsuo Handa Cc: Arjan van de Ven Acked-by: Rusty Russell Cc: Tejun Heo Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kmod.c | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index 2a8351516a0e..6f99aead66c6 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -45,6 +45,13 @@ extern int max_threads; static struct workqueue_struct *khelper_wq; +/* + * kmod_thread_locker is used for deadlock avoidance. There is no explicit + * locking to protect this global - it is private to the singleton khelper + * thread and should only ever be modified by that thread. + */ +static const struct task_struct *kmod_thread_locker; + #define CAP_BSET (void *)1 #define CAP_PI (void *)2 @@ -221,6 +228,13 @@ fail: return 0; } +static int call_helper(void *data) +{ + /* Worker thread started blocking khelper thread. */ + kmod_thread_locker = current; + return ____call_usermodehelper(data); +} + static void call_usermodehelper_freeinfo(struct subprocess_info *info) { if (info->cleanup) @@ -295,9 +309,12 @@ static void __call_usermodehelper(struct work_struct *work) if (wait == UMH_WAIT_PROC) pid = kernel_thread(wait_for_helper, sub_info, CLONE_FS | CLONE_FILES | SIGCHLD); - else - pid = kernel_thread(____call_usermodehelper, sub_info, + else { + pid = kernel_thread(call_helper, sub_info, CLONE_VFORK | SIGCHLD); + /* Worker thread stopped blocking khelper thread. */ + kmod_thread_locker = NULL; + } switch (wait) { case UMH_NO_WAIT: @@ -548,6 +565,16 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) retval = -EBUSY; goto out; } + /* + * Worker thread must not wait for khelper thread at below + * wait_for_completion() if the thread was created with CLONE_VFORK + * flag, for khelper thread is already waiting for the thread at + * wait_for_completion() in do_fork(). + */ + if (wait != UMH_NO_WAIT && current == kmod_thread_locker) { + retval = -EBUSY; + goto out; + } sub_info->complete = &done; sub_info->wait = wait; -- cgit v1.2.3 From b2412b7fa7a3816fa8633dc2ff19f1a90aabe423 Mon Sep 17 00:00:00 2001 From: Huang Shijie Date: Mon, 30 Jul 2012 14:42:30 -0700 Subject: fork: use vma_pages() to simplify the code The current code can be replaced by vma_pages(). So use it to simplify the code. [akpm@linux-foundation.org: initialise `len' at its definition site] Signed-off-by: Huang Shijie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index ff1cad3b7bdc..2c1802948a38 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -391,8 +391,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) } charge = 0; if (mpnt->vm_flags & VM_ACCOUNT) { - unsigned long len; - len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; + unsigned long len = vma_pages(mpnt); + if (security_vm_enough_memory_mm(oldmm, len)) /* sic */ goto fail_nomem; charge = len; -- cgit v1.2.3 From 87bec58a52652e2eb2a575692a40f9466c7bd31b Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 30 Jul 2012 14:42:31 -0700 Subject: revert "sched: Fix fork() error path to not crash" To make way for "fork: fix error handling in dup_task()", which fixes the errors more completely. Cc: Salman Qazi Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Akinobu Mita Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 2c1802948a38..088025bd1925 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -304,17 +304,12 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) } err = arch_dup_task_struct(tsk, orig); - - /* - * We defer looking at err, because we will need this setup - * for the clean up path to work correctly. - */ - tsk->stack = ti; - setup_thread_stack(tsk, orig); - if (err) goto out; + tsk->stack = ti; + + setup_thread_stack(tsk, orig); clear_user_return_notifier(tsk); clear_tsk_need_resched(tsk); stackend = end_of_stack(tsk); -- cgit v1.2.3 From f19b9f74b7ea3b21ddcee55d852a6488239608a4 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Mon, 30 Jul 2012 14:42:33 -0700 Subject: fork: fix error handling in dup_task() The function dup_task() may fail at the following function calls in the following order. 0) alloc_task_struct_node() 1) alloc_thread_info_node() 2) arch_dup_task_struct() Error by 0) is not a matter, it can just return. But error by 1) requires releasing task_struct allocated by 0) before it returns. Likewise, error by 2) requires releasing task_struct and thread_info allocated by 0) and 1). The existing error handling calls free_task_struct() and free_thread_info() which do not only release task_struct and thread_info, but also call architecture specific arch_release_task_struct() and arch_release_thread_info(). The problem is that task_struct and thread_info are not fully initialized yet at this point, but arch_release_task_struct() and arch_release_thread_info() are called with them. For example, x86 defines its own arch_release_task_struct() that releases a task_xstate. If alloc_thread_info_node() fails in dup_task(), arch_release_task_struct() is called with task_struct which is just allocated and filled with garbage in this error handling. This actually happened with tools/testing/fault-injection/failcmd.sh # env FAILCMD_TYPE=fail_page_alloc \ ./tools/testing/fault-injection/failcmd.sh --times=100 \ --min-order=0 --ignore-gfp-wait=0 \ -- make -C tools/testing/selftests/ run_tests In order to fix this issue, make free_{task_struct,thread_info}() not to call arch_release_{task_struct,thread_info}() and call arch_release_{task_struct,thread_info}() implicitly where needed. Default arch_release_task_struct() and arch_release_thread_info() are defined as empty by default. So this change only affects the architectures which implement their own arch_release_task_struct() or arch_release_thread_info() as listed below. arch_release_task_struct(): x86, sh arch_release_thread_info(): mn10300, tile Signed-off-by: Akinobu Mita Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: David Howells Cc: Koichi Yasutake Cc: Paul Mundt Cc: Chris Metcalf Cc: Salman Qazi Cc: Peter Zijlstra Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 088025bd1925..8efac1fe56bc 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -114,6 +114,10 @@ int nr_processes(void) return total; } +void __weak arch_release_task_struct(struct task_struct *tsk) +{ +} + #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR static struct kmem_cache *task_struct_cachep; @@ -122,17 +126,17 @@ static inline struct task_struct *alloc_task_struct_node(int node) return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node); } -void __weak arch_release_task_struct(struct task_struct *tsk) { } - static inline void free_task_struct(struct task_struct *tsk) { - arch_release_task_struct(tsk); kmem_cache_free(task_struct_cachep, tsk); } #endif +void __weak arch_release_thread_info(struct thread_info *ti) +{ +} + #ifndef CONFIG_ARCH_THREAD_INFO_ALLOCATOR -void __weak arch_release_thread_info(struct thread_info *ti) { } /* * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a @@ -150,7 +154,6 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, static inline void free_thread_info(struct thread_info *ti) { - arch_release_thread_info(ti); free_pages((unsigned long)ti, THREAD_SIZE_ORDER); } # else @@ -164,7 +167,6 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, static void free_thread_info(struct thread_info *ti) { - arch_release_thread_info(ti); kmem_cache_free(thread_info_cache, ti); } @@ -205,10 +207,12 @@ static void account_kernel_stack(struct thread_info *ti, int account) void free_task(struct task_struct *tsk) { account_kernel_stack(tsk->stack, -1); + arch_release_thread_info(tsk->stack); free_thread_info(tsk->stack); rt_mutex_debug_task_free(tsk); ftrace_graph_exit_task(tsk); put_seccomp_filter(tsk); + arch_release_task_struct(tsk); free_task_struct(tsk); } EXPORT_SYMBOL(free_task); @@ -298,14 +302,12 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) return NULL; ti = alloc_thread_info_node(tsk, node); - if (!ti) { - free_task_struct(tsk); - return NULL; - } + if (!ti) + goto free_tsk; err = arch_dup_task_struct(tsk, orig); if (err) - goto out; + goto free_ti; tsk->stack = ti; @@ -333,8 +335,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) return tsk; -out: +free_ti: free_thread_info(ti); +free_tsk: free_task_struct(tsk); return NULL; } -- cgit v1.2.3 From 63dca8d5b5ef7effb58b8d6892a024125c0fab0b Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Mon, 30 Jul 2012 14:42:36 -0700 Subject: kdump: append newline to the last lien of vmcoreinfo note The last line of vmcoreinfo note does not end with \n. Parsing all the lines in note becomes easier if all lines end with \n instead of trying to special case the last line. I know at least one tool, vmcore-dmesg in kexec-tools tree which made the assumption that all lines end with \n. I think it is a good idea to fix it. Signed-off-by: Vivek Goyal Cc: "Eric W. Biederman" Cc: Atsushi Kumagai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 4e2e472f6aeb..0668d58d6413 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1424,7 +1424,7 @@ static void update_vmcoreinfo_note(void) void crash_save_vmcoreinfo(void) { - vmcoreinfo_append_str("CRASHTIME=%ld", get_seconds()); + vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds()); update_vmcoreinfo_note(); } -- cgit v1.2.3 From fd4b616b0fbb77e3f349e7d60914f2b7c7e39f9c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 30 Jul 2012 14:42:48 -0700 Subject: sysctl: suppress kmemleak messages register_sysctl_table() is a strange function, as it makes internal allocations (a header) to register a sysctl_table. This header is a handle to the table that is created, and can be used to unregister the table. But if the table is permanent and never unregistered, the header acts the same as a static variable. Unfortunately, this allocation of memory that is never expected to be freed fools kmemleak in thinking that we have leaked memory. For those sysctl tables that are never unregistered, and have no pointer referencing them, kmemleak will think that these are memory leaks: unreferenced object 0xffff880079fb9d40 (size 192): comm "swapper/0", pid 0, jiffies 4294667316 (age 12614.152s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [] kmemleak_alloc+0x73/0x98 [] kmemleak_alloc_recursive.constprop.42+0x16/0x18 [] __kmalloc+0x107/0x153 [] kzalloc.constprop.8+0xe/0x10 [] __register_sysctl_paths+0xe1/0x160 [] register_sysctl_paths+0x1b/0x1d [] register_sysctl_table+0x18/0x1a [] sysctl_init+0x10/0x14 [] proc_sys_init+0x2f/0x31 [] proc_root_init+0xa5/0xa7 [] start_kernel+0x3d0/0x40a [] x86_64_start_reservations+0xae/0xb2 [] x86_64_start_kernel+0x102/0x111 [] 0xffffffffffffffff The sysctl_base_table used by sysctl itself is one such instance that registers the table to never be unregistered. Use kmemleak_not_leak() to suppress the kmemleak false positive. Signed-off-by: Steven Rostedt Acked-by: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index b46f496405e4..97186b99b0e4 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -1556,7 +1557,10 @@ static struct ctl_table dev_table[] = { int __init sysctl_init(void) { - register_sysctl_table(sysctl_base_table); + struct ctl_table_header *hdr; + + hdr = register_sysctl_table(sysctl_base_table); + kmemleak_not_leak(hdr); return 0; } -- cgit v1.2.3 From 25353b3377d5a75d4b830477bb90a3691155de72 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Mon, 30 Jul 2012 14:42:49 -0700 Subject: taskstats: check nla_reserve() return Addresses https://bugzilla.kernel.org/show_bug.cgi?id=44621 Reported-by: Signed-off-by: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/taskstats.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index e66046456f4f..d0a32796550f 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -436,6 +436,11 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info) na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS, sizeof(struct cgroupstats)); + if (na == NULL) { + rc = -EMSGSIZE; + goto err; + } + stats = nla_data(na); memset(stats, 0, sizeof(*stats)); -- cgit v1.2.3 From 65fed8f6f23070b56d0ed3841173ddd410130a89 Mon Sep 17 00:00:00 2001 From: Octavian Purdila Date: Mon, 30 Jul 2012 14:42:58 -0700 Subject: resource: make sure requested range is included in the root range When the requested range is outside of the root range the logic in __reserve_region_with_split will cause an infinite recursion which will overflow the stack as seen in the warning bellow. This particular stack overflow was caused by requesting the (100000000-107ffffff) range while the root range was (0-ffffffff). In this case __request_resource would return the whole root range as conflict range (i.e. 0-ffffffff). Then, the logic in __reserve_region_with_split would continue the recursion requesting the new range as (conflict->end+1, end) which incidentally in this case equals the originally requested range. This patch aborts looking for an usable range when the request does not intersect with the root range. When the request partially overlaps with the root range, it ajust the request to fall in the root range and then continues with the new request. When the request is modified or aborted errors and a stack trace are logged to allow catching the errors in the upper layers. [ 5.968374] WARNING: at kernel/sched.c:4129 sub_preempt_count+0x63/0x89() [ 5.975150] Modules linked in: [ 5.978184] Pid: 1, comm: swapper Not tainted 3.0.22-mid27-00004-gb72c817 #46 [ 5.985324] Call Trace: [ 5.987759] [] ? console_unlock+0x17b/0x18d [ 5.992891] [] warn_slowpath_common+0x48/0x5d [ 5.998194] [] ? sub_preempt_count+0x63/0x89 [ 6.003412] [] warn_slowpath_null+0xf/0x13 [ 6.008453] [] sub_preempt_count+0x63/0x89 [ 6.013499] [] _raw_spin_unlock+0x27/0x3f [ 6.018453] [] add_partial+0x36/0x3b [ 6.022973] [] deactivate_slab+0x96/0xb4 [ 6.027842] [] __slab_alloc.isra.54.constprop.63+0x204/0x241 [ 6.034456] [] ? kzalloc.constprop.5+0x29/0x38 [ 6.039842] [] ? kzalloc.constprop.5+0x29/0x38 [ 6.045232] [] kmem_cache_alloc_trace+0x51/0xb0 [ 6.050710] [] ? kzalloc.constprop.5+0x29/0x38 [ 6.056100] [] kzalloc.constprop.5+0x29/0x38 [ 6.061320] [] __reserve_region_with_split+0x1c/0xd1 [ 6.067230] [] __reserve_region_with_split+0xc6/0xd1 ... [ 7.179057] [] __reserve_region_with_split+0xc6/0xd1 [ 7.184970] [] reserve_region_with_split+0x30/0x42 [ 7.190709] [] e820_reserve_resources_late+0xd1/0xe9 [ 7.196623] [] pcibios_resource_survey+0x23/0x2a [ 7.202184] [] pcibios_init+0x23/0x35 [ 7.206789] [] pci_subsys_init+0x3f/0x44 [ 7.211659] [] do_one_initcall+0x72/0x122 [ 7.216615] [] ? pci_legacy_init+0x3d/0x3d [ 7.221659] [] kernel_init+0xa6/0x118 [ 7.226265] [] ? start_kernel+0x334/0x334 [ 7.231223] [] kernel_thread_helper+0x6/0x10 Signed-off-by: Octavian Purdila Signed-off-by: Ram Pai Cc: Jesse Barnes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/resource.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index dc8b47764443..34d45886ee84 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -7,6 +7,8 @@ * Arbitrary resource management. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -791,8 +793,28 @@ void __init reserve_region_with_split(struct resource *root, resource_size_t start, resource_size_t end, const char *name) { + int abort = 0; + write_lock(&resource_lock); - __reserve_region_with_split(root, start, end, name); + if (root->start > start || root->end < end) { + pr_err("requested range [0x%llx-0x%llx] not in root %pr\n", + (unsigned long long)start, (unsigned long long)end, + root); + if (start > root->end || end < root->start) + abort = 1; + else { + if (end > root->end) + end = root->end; + if (start < root->start) + start = root->start; + pr_err("fixing request to [0x%llx-0x%llx]\n", + (unsigned long long)start, + (unsigned long long)end); + } + dump_stack(); + } + if (!abort) + __reserve_region_with_split(root, start, end, name); write_unlock(&resource_lock); } -- cgit v1.2.3 From 44de9d0cad41f2c51ef26916842be046b582dcc9 Mon Sep 17 00:00:00 2001 From: Huang Shijie Date: Tue, 31 Jul 2012 16:41:49 -0700 Subject: mm: account the total_vm in the vm_stat_account() vm_stat_account() accounts the shared_vm, stack_vm and reserved_vm now. But we can also account for total_vm in the vm_stat_account() which makes the code tidy. Even for mprotect_fixup(), we can get the right result in the end. Signed-off-by: Huang Shijie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 8efac1fe56bc..aaa8813c45d1 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -381,10 +381,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) struct file *file; if (mpnt->vm_flags & VM_DONTCOPY) { - long pages = vma_pages(mpnt); - mm->total_vm -= pages; vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file, - -pages); + -vma_pages(mpnt)); continue; } charge = 0; -- cgit v1.2.3 From 3965c9ae47d64aadf6f13b6fcd37767b83c0689a Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 31 Jul 2012 16:41:52 -0700 Subject: mm: prepare for removal of obsolete /proc/sys/vm/nr_pdflush_threads Since per-BDI flusher threads were introduced in 2.6, the pdflush mechanism is not used any more. But the old interface exported through /proc/sys/vm/nr_pdflush_threads still exists and is obviously useless. For back-compatibility, printk warning information and return 2 to notify the users that the interface is removed. Signed-off-by: Wanpeng Li Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 8 +++----- kernel/sysctl_binary.c | 2 +- 2 files changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 97186b99b0e4..6502d35a25ba 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1101,11 +1101,9 @@ static struct ctl_table vm_table[] = { .extra1 = &zero, }, { - .procname = "nr_pdflush_threads", - .data = &nr_pdflush_threads, - .maxlen = sizeof nr_pdflush_threads, - .mode = 0444 /* read-only*/, - .proc_handler = proc_dointvec, + .procname = "nr_pdflush_threads", + .mode = 0444 /* read-only */, + .proc_handler = pdflush_proc_obsolete, }, { .procname = "swappiness", diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index a650694883a1..65bdcf198d4e 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -147,7 +147,7 @@ static const struct bin_table bin_vm_table[] = { { CTL_INT, VM_DIRTY_RATIO, "dirty_ratio" }, /* VM_DIRTY_WB_CS "dirty_writeback_centisecs" no longer used */ /* VM_DIRTY_EXPIRE_CS "dirty_expire_centisecs" no longer used */ - { CTL_INT, VM_NR_PDFLUSH_THREADS, "nr_pdflush_threads" }, + /* VM_NR_PDFLUSH_THREADS "nr_pdflush_threads" no longer used */ { CTL_INT, VM_OVERCOMMIT_RATIO, "overcommit_ratio" }, /* VM_PAGEBUF unused */ /* VM_HUGETLB_PAGES "nr_hugepages" no longer used */ -- cgit v1.2.3 From c255a458055e459f65eb7b7f51dc5dbdd0caf1d8 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 31 Jul 2012 16:43:02 -0700 Subject: memcg: rename config variables Sanity: CONFIG_CGROUP_MEM_RES_CTLR -> CONFIG_MEMCG CONFIG_CGROUP_MEM_RES_CTLR_SWAP -> CONFIG_MEMCG_SWAP CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED -> CONFIG_MEMCG_SWAP_ENABLED CONFIG_CGROUP_MEM_RES_CTLR_KMEM -> CONFIG_MEMCG_KMEM [mhocko@suse.cz: fix missed bits] Cc: Glauber Costa Acked-by: Michal Hocko Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: Hugh Dickins Cc: Tejun Heo Cc: Aneesh Kumar K.V Cc: David Rientjes Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index aaa8813c45d1..3bd2280d79f6 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1306,7 +1306,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, #ifdef CONFIG_DEBUG_MUTEXES p->blocked_on = NULL; /* not blocked yet */ #endif -#ifdef CONFIG_CGROUP_MEM_RES_CTLR +#ifdef CONFIG_MEMCG p->memcg_batch.do_batch = 0; p->memcg_batch.memcg = NULL; #endif -- cgit v1.2.3 From 9adb62a5df9c0fbef7b4665919329f73a34651ed Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 31 Jul 2012 16:43:28 -0700 Subject: mm/hotplug: correctly setup fallback zonelists when creating new pgdat When hotadd_new_pgdat() is called to create new pgdat for a new node, a fallback zonelist should be created for the new node. There's code to try to achieve that in hotadd_new_pgdat() as below: /* * The node we allocated has no zone fallback lists. For avoiding * to access not-initialized zonelist, build here. */ mutex_lock(&zonelists_mutex); build_all_zonelists(pgdat, NULL); mutex_unlock(&zonelists_mutex); But it doesn't work as expected. When hotadd_new_pgdat() is called, the new node is still in offline state because node_set_online(nid) hasn't been called yet. And build_all_zonelists() only builds zonelists for online nodes as: for_each_online_node(nid) { pg_data_t *pgdat = NODE_DATA(nid); build_zonelists(pgdat); build_zonelist_cache(pgdat); } Though we hope to create zonelist for the new pgdat, but it doesn't. So add a new parameter "pgdat" the build_all_zonelists() to build pgdat for the new pgdat too. Signed-off-by: Jiang Liu Signed-off-by: Xishi Qiu Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Rusty Russell Cc: Yinghai Lu Cc: Tony Luck Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: David Rientjes Cc: Keping Chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index a4eb5227a19e..14d32588cccd 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -416,7 +416,7 @@ int __cpuinit cpu_up(unsigned int cpu) if (pgdat->node_zonelists->_zonerefs->zone == NULL) { mutex_lock(&zonelists_mutex); - build_all_zonelists(NULL); + build_all_zonelists(NULL, NULL); mutex_unlock(&zonelists_mutex); } #endif -- cgit v1.2.3 From 907aed48f65efeecf91575397e3d79335d93a466 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 31 Jul 2012 16:44:07 -0700 Subject: mm: allow PF_MEMALLOC from softirq context This is needed to allow network softirq packet processing to make use of PF_MEMALLOC. Currently softirq context cannot use PF_MEMALLOC due to it not being associated with a task, and therefore not having task flags to fiddle with - thus the gfp to alloc flag mapping ignores the task flags when in interrupts (hard or soft) context. Allowing softirqs to make use of PF_MEMALLOC therefore requires some trickery. This patch borrows the task flags from whatever process happens to be preempted by the softirq. It then modifies the gfp to alloc flags mapping to not exclude task flags in softirq context, and modify the softirq code to save, clear and restore the PF_MEMALLOC flag. The save and clear, ensures the preempted task's PF_MEMALLOC flag doesn't leak into the softirq. The restore ensures a softirq's PF_MEMALLOC flag cannot leak back into the preempted process. This should be safe due to the following reasons Softirqs can run on multiple CPUs sure but the same task should not be executing the same softirq code. Neither should the softirq handler be preempted by any other softirq handler so the flags should not leak to an unrelated softirq. Softirqs re-enable hardware interrupts in __do_softirq() so can be preempted by hardware interrupts so PF_MEMALLOC is inherited by the hard IRQ. However, this is similar to a process in reclaim being preempted by a hardirq. While PF_MEMALLOC is set, gfp_to_alloc_flags() distinguishes between hard and soft irqs and avoids giving a hardirq the ALLOC_NO_WATERMARKS flag. If the softirq is deferred to ksoftirq then its flags may be used instead of a normal tasks but as the softirq cannot be preempted, the PF_MEMALLOC flag does not leak to other code by accident. [davem@davemloft.net: Document why PF_MEMALLOC is safe] Signed-off-by: Peter Zijlstra Signed-off-by: Mel Gorman Cc: David Miller Cc: Neil Brown Cc: Mike Christie Cc: Eric B Munson Cc: Eric Dumazet Cc: Sebastian Andrzej Siewior Cc: Mel Gorman Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/softirq.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 671f9594e368..b73e681df09e 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -210,6 +210,14 @@ asmlinkage void __do_softirq(void) __u32 pending; int max_restart = MAX_SOFTIRQ_RESTART; int cpu; + unsigned long old_flags = current->flags; + + /* + * Mask out PF_MEMALLOC s current task context is borrowed for the + * softirq. A softirq handled such as network RX might set PF_MEMALLOC + * again if the socket is related to swap + */ + current->flags &= ~PF_MEMALLOC; pending = local_softirq_pending(); account_system_vtime(current); @@ -265,6 +273,7 @@ restart: account_system_vtime(current); __local_bh_enable(SOFTIRQ_OFFSET); + tsk_restore_flags(current, old_flags, PF_MEMALLOC); } #ifndef __ARCH_HAS_DO_SOFTIRQ -- cgit v1.2.3