From d34d2c97ae56961ca73fc8704aec2304bb820668 Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Tue, 20 Jun 2017 12:12:49 +0100 Subject: cpufreq/sched: Use cpu max freq rather than policy max When we convert capacity into frequency, we used policy->max to get the max freq of the cpu. Since this can be changed by userspace policy or thermal events, we are potentially asking for a lower frequency than the utilization demands. Change over to using cpuinfo.max_freq, which is the max freq supported by that cpu, rather than the currently-chosen max. The frequency granted still honours the policy max. Tested by setting a userspace policy and observing the relevant vars in a trace. In this instance, we ask for around 1GHz instead of 620MHz. freq_new=1013512 unfixed_freq_new=624487 capacity=546 cpuinfo_max=1900800 policy_max=1171200 Change-Id: I8c5694db42243c6fb78bb9be9046b06ac81295e7 Signed-off-by: Chris Redpath --- kernel/sched/cpufreq_sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/cpufreq_sched.c b/kernel/sched/cpufreq_sched.c index 6ffb23adbcef..ec0aed7a8f96 100644 --- a/kernel/sched/cpufreq_sched.c +++ b/kernel/sched/cpufreq_sched.c @@ -202,7 +202,7 @@ static void update_fdomain_capacity_request(int cpu) } /* Convert the new maximum capacity request into a cpu frequency */ - freq_new = capacity * policy->max >> SCHED_CAPACITY_SHIFT; + freq_new = capacity * policy->cpuinfo.max_freq >> SCHED_CAPACITY_SHIFT; if (cpufreq_frequency_table_target(policy, policy->freq_table, freq_new, CPUFREQ_RELATION_L, &index_new)) -- cgit v1.2.3 From c9391ba6ee7930ccfda71748cec2d8d210510cc8 Mon Sep 17 00:00:00 2001 From: Sherry Yang Date: Thu, 5 Oct 2017 17:00:57 -0400 Subject: FROMLIST: android: binder: Change binder_shrinker to static MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (from https://patchwork.kernel.org/patch/9990321/) The binder_shrinker struct is not used anywhere outside of binder_alloc.c and should be static. Bug: 63926541 Change-Id: I7a13d4ddbaaf3721cddfe1d860e34c7be80dd082 Acked-by: Arve Hjønnevåg Signed-off-by: Sherry Yang --- drivers/android/binder_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index b95da16fd938..22cb64f8d96c 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -986,7 +986,7 @@ binder_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) return ret; } -struct shrinker binder_shrinker = { +static struct shrinker binder_shrinker = { .count_objects = binder_shrink_count, .scan_objects = binder_shrink_scan, .seeks = DEFAULT_SEEKS, -- cgit v1.2.3 From a7e1d33c7a1101936cf7f048a0ef7cac62e32cd0 Mon Sep 17 00:00:00 2001 From: Sherry Yang Date: Thu, 5 Oct 2017 17:13:47 -0400 Subject: FROMLIST: android: binder: Fix null ptr dereference in debug msg MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (from https://patchwork.kernel.org/patch/9990323/) Don't access next->data in the kernel debug message when the next buffer is null.
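For illustration, a minimal sketch of the pattern the fix applies; struct buf and log_merge are hypothetical stand-ins for the driver's binder_buffer bookkeeping, and %pK is the kernel's hashed-pointer format:

        struct buf { void *data; };     /* hypothetical stand-in */

        /* Fold the NULL check into the printed argument instead of
         * dereferencing a possibly-NULL neighbour inside the print.
         */
        static void log_merge(struct buf *buffer, struct buf *prev,
                              struct buf *next)
        {
                pr_debug("buffer %pK, prev %pK, next %pK\n",
                         buffer->data,
                         prev ? prev->data : NULL,
                         next ? next->data : NULL);
        }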
Bug: 36007193 Change-Id: Ib8240d7e9a7087a2256e88c0ae84b9df0f2d0224 Acked-by: Arve Hjønnevåg Signed-off-by: Sherry Yang --- drivers/android/binder_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index 22cb64f8d96c..e431cfcfb59b 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -561,7 +561,7 @@ static void binder_delete_free_buffer(struct binder_alloc *alloc, binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, "%d: merge free, buffer %pK do not share page with %pK or %pK\n", alloc->pid, buffer->data, - prev->data, next->data); + prev->data, next ? next->data : NULL); binder_update_page_range(alloc, 0, buffer_start_page(buffer), buffer_start_page(buffer) + PAGE_SIZE); } -- cgit v1.2.3 From 774481506a83d305353330e44b8e0f6ad80a34a8 Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Fri, 15 Sep 2017 08:25:32 +0800 Subject: cpufreq: schedutil: clamp util to CPU maximum capacity The code accumulates the CPU util of the different scheduling classes and, when the total util value is larger than the CPU capacity, clamps it to the CPU maximum capacity. This gives a correct util value when using the PELT signal, but with the WALT signal the clamp is skipped. WALT does not accumulate utilization from the different classes, but because a boost margin is applied to the WALT signal, its CPU util value can still be larger than the CPU capacity; so always clamp util to the CPU maximum capacity. Change-Id: I05481ddbf20246bb9be15b6bd21b6ec039015ea8 Signed-off-by: Leo Yan --- kernel/sched/cpufreq_schedutil.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 28977799017b..d3765f0cb699 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -216,8 +216,9 @@ static void sugov_get_util(unsigned long *util, unsigned long *max, u64 time) *util = boosted_cpu_util(cpu); if (likely(use_pelt())) - *util = min((*util + rt), max_cap); + *util = *util + rt; + *util = min(*util, max_cap); *max = max_cap; } -- cgit v1.2.3 From 89805266af7825f6b8ccb8ff23a8e3aec4418dea Mon Sep 17 00:00:00 2001 From: Dmitry Shmidt Date: Tue, 24 Oct 2017 12:42:10 -0700 Subject: Revert "UPSTREAM: efi/libstub/arm64: Set -fpie when building the EFI stub" It breaks boot with a UEFI bootloader. This reverts commit 2f2860a504a30a7645c6a0ec06767c5c7677a4ea. --- drivers/firmware/efi/libstub/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile index d1f1bde85d3f..68bdd9f23e13 100644 --- a/drivers/firmware/efi/libstub/Makefile +++ b/drivers/firmware/efi/libstub/Makefile @@ -10,7 +10,7 @@ cflags-$(CONFIG_X86) += -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2 \ -fPIC -fno-strict-aliasing -mno-red-zone \ -mno-mmx -mno-sse -cflags-$(CONFIG_ARM64) := $(subst -pg,,$(KBUILD_CFLAGS)) -fpie +cflags-$(CONFIG_ARM64) := $(subst -pg,,$(KBUILD_CFLAGS)) cflags-$(CONFIG_ARM) := $(subst -pg,,$(KBUILD_CFLAGS)) \ -fno-builtin -fpic -mno-single-pic-base -- cgit v1.2.3 From 4f8767d1ca307ab8c03889534a3c5a525ce7485d Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Wed, 25 Oct 2017 17:25:20 +0100 Subject: ANDROID: sched/fair: Select correct capacity state for energy_diff The util returned from group_max_util is not capped at the max util present in the group, so it can be larger than the capacity stored in the array.
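With the old loop, a util above the largest cap_states[].cap means the break is never taken, so idx leaves the loop equal to sge->nr_cap_states and is stored as the capacity index, one past the end of the array:

        /* Pre-patch logic: if util exceeds every cap_states[idx].cap,
         * the loop falls through with idx == nr_cap_states, an
         * out-of-bounds index into cap_states[].
         */
        for (idx = 0; idx < sge->nr_cap_states; idx++)
                if (sge->cap_states[idx].cap >= util)
                        break;
        eenv->cap_idx = idx;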
Ensure that when this happens, we always use the last entry in the array to fetch energy from. Tested with synthetics on Juno board. Bug: 38159576 Change-Id: I89fb52fb7e68fa3e682e308acc232596672d03f7 Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b1df3873b6fd..5cac6a77b2bc 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5384,17 +5384,20 @@ long group_norm_util(struct energy_env *eenv, struct sched_group *sg) static int find_new_capacity(struct energy_env *eenv, const struct sched_group_energy * const sge) { - int idx; + int idx, max_idx = sge->nr_cap_states - 1; unsigned long util = group_max_util(eenv); + /* default is max_cap if we don't find a match */ + eenv->cap_idx = max_idx; + for (idx = 0; idx < sge->nr_cap_states; idx++) { - if (sge->cap_states[idx].cap >= util) + if (sge->cap_states[idx].cap >= util) { + eenv->cap_idx = idx; break; + } } - eenv->cap_idx = idx; - - return idx; + return eenv->cap_idx; } static int group_idle_state(struct energy_env *eenv, struct sched_group *sg) -- cgit v1.2.3 From cc005cde424b9ae89e1b6fe47da4f2c4a827c73f Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Fri, 4 Aug 2017 10:17:00 -0700 Subject: UPSTREAM: arm64: compat: Remove leftover variable declaration (cherry picked from commit 82d24d114f249d919b918ff8eefde4117db8f088) Commit a1d5ebaf8ccd ("arm64: big-endian: don't treat code as data when copying sigret code") moved the 32-bit sigreturn trampoline code from the aarch32_sigret_code array to kuser32.S. The commit removed the array definition from signal32.c, but not its declaration in signal32.h. Remove the leftover declaration. Signed-off-by: Kevin Brodsky Signed-off-by: Mark Salyzyn Signed-off-by: Catalin Marinas Bug: 20045882 Bug: 63737556 Change-Id: Ic8a5f0e367f0ecd5c5ddd9e3885d0285f91cf89e --- arch/arm64/include/asm/signal32.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/arm64/include/asm/signal32.h b/arch/arm64/include/asm/signal32.h index eeaa97559bab..81abea0b7650 100644 --- a/arch/arm64/include/asm/signal32.h +++ b/arch/arm64/include/asm/signal32.h @@ -22,8 +22,6 @@ #define AARCH32_KERN_SIGRET_CODE_OFFSET 0x500 -extern const compat_ulong_t aarch32_sigret_code[6]; - int compat_setup_frame(int usig, struct ksignal *ksig, sigset_t *set, struct pt_regs *regs); int compat_setup_rt_frame(int usig, struct ksignal *ksig, sigset_t *set, -- cgit v1.2.3 From 95317055df213596ee7c25836a6ac64bb56c3cb5 Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Thu, 19 Oct 2017 15:04:46 +0200 Subject: ANDROID: binder: Add thread->process_todo flag. This flag determines whether the thread should currently process the work in the thread->todo worklist. The prime usecase for this is improving the performance of synchronous transactions: all synchronous transactions post a BR_TRANSACTION_COMPLETE to the calling thread, but there's no reason to return that command to userspace right away - userspace anyway needs to wait for the reply. Likewise, a synchronous transaction that contains a binder object can cause a BC_ACQUIRE/BC_INCREFS to be returned to userspace; since the caller must anyway hold a strong/weak ref for the duration of the call, postponing these commands until the reply comes in is not a problem. Note that this flag is not used to determine whether a thread can handle process work; a thread should never pick up process work when thread work is still pending. 
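The caller-side effect, sketched with the helpers this patch introduces (see the diff below): queue the BR_TRANSACTION_COMPLETE for the calling thread without setting process_todo, so binder_has_work() stays false and the caller sleeps through to the reply instead of waking up for the intermediate command.

        /* Synchronous call: the completion is queued on the calling
         * thread's todo list but does not mark the list processable,
         * so the thread is not woken just for BR_TRANSACTION_COMPLETE.
         */
        binder_inner_proc_lock(proc);
        binder_enqueue_thread_work_ilocked_nowake(thread, tcomplete);
        binder_inner_proc_unlock(proc);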
Before patch: ------------------------------------------------------------------ Benchmark Time CPU Iterations ------------------------------------------------------------------ BM_sendVec_binderize/4 45959 ns 20288 ns 34351 BM_sendVec_binderize/8 45603 ns 20080 ns 34909 BM_sendVec_binderize/16 45528 ns 20113 ns 34863 BM_sendVec_binderize/32 45551 ns 20122 ns 34881 BM_sendVec_binderize/64 45701 ns 20183 ns 34864 BM_sendVec_binderize/128 45824 ns 20250 ns 34576 BM_sendVec_binderize/256 45695 ns 20171 ns 34759 BM_sendVec_binderize/512 45743 ns 20211 ns 34489 BM_sendVec_binderize/1024 46169 ns 20430 ns 34081 After patch: ------------------------------------------------------------------ Benchmark Time CPU Iterations ------------------------------------------------------------------ BM_sendVec_binderize/4 42939 ns 17262 ns 40653 BM_sendVec_binderize/8 42823 ns 17243 ns 40671 BM_sendVec_binderize/16 42898 ns 17243 ns 40594 BM_sendVec_binderize/32 42838 ns 17267 ns 40527 BM_sendVec_binderize/64 42854 ns 17249 ns 40379 BM_sendVec_binderize/128 42881 ns 17288 ns 40427 BM_sendVec_binderize/256 42917 ns 17297 ns 40429 BM_sendVec_binderize/512 43184 ns 17395 ns 40411 BM_sendVec_binderize/1024 43119 ns 17357 ns 40432 Signed-off-by: Martijn Coenen Change-Id: Ia70287066d62aba64e98ac44ff1214e37ca75693 --- drivers/android/binder.c | 144 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 100 insertions(+), 44 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 112e61a4806b..6dfcc80f3fc6 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -601,6 +601,8 @@ enum { * (protected by @proc->inner_lock) * @todo: list of work to do for this thread * (protected by @proc->inner_lock) + * @process_todo: whether work in @todo should be processed + * (protected by @proc->inner_lock) * @return_error: transaction errors reported by this thread * (only accessed by this thread) * @reply_error: transaction errors reported by target thread @@ -627,6 +629,7 @@ struct binder_thread { bool looper_need_return; /* can be written by other thread */ struct binder_transaction *transaction_stack; struct list_head todo; + bool process_todo; struct binder_error return_error; struct binder_error reply_error; wait_queue_head_t wait; @@ -814,6 +817,16 @@ static bool binder_worklist_empty(struct binder_proc *proc, return ret; } +/** + * binder_enqueue_work_ilocked() - Add an item to the work list + * @work: struct binder_work to add to list + * @target_list: list to add work to + * + * Adds the work to the specified list. Asserts that work + * is not already on a list. + * + * Requires the proc->inner_lock to be held. + */ static void binder_enqueue_work_ilocked(struct binder_work *work, struct list_head *target_list) @@ -824,22 +837,56 @@ binder_enqueue_work_ilocked(struct binder_work *work, } /** - * binder_enqueue_work() - Add an item to the work list - * @proc: binder_proc associated with list + * binder_enqueue_thread_work_ilocked_nowake() - Add thread work + * @thread: thread to queue work to * @work: struct binder_work to add to list - * @target_list: list to add work to * - * Adds the work to the specified list. Asserts that work - * is not already on a list. + * Adds the work to the todo list of the thread. Doesn't set the process_todo + * flag, which means that (if it wasn't already set) the thread will go to + * sleep without handling this work when it calls read. + * + * Requires the proc->inner_lock to be held. 
*/ static void -binder_enqueue_work(struct binder_proc *proc, - struct binder_work *work, - struct list_head *target_list) +binder_enqueue_thread_work_ilocked_nowake(struct binder_thread *thread, + struct binder_work *work) { - binder_inner_proc_lock(proc); - binder_enqueue_work_ilocked(work, target_list); - binder_inner_proc_unlock(proc); + binder_enqueue_work_ilocked(work, &thread->todo); +} + +/** + * binder_enqueue_thread_work_ilocked() - Add an item to the thread work list + * @thread: thread to queue work to + * @work: struct binder_work to add to list + * + * Adds the work to the todo list of the thread, and enables processing + * of the todo queue. + * + * Requires the proc->inner_lock to be held. + */ +static void +binder_enqueue_thread_work_ilocked(struct binder_thread *thread, + struct binder_work *work) +{ + binder_enqueue_work_ilocked(work, &thread->todo); + thread->process_todo = true; +} + +/** + * binder_enqueue_thread_work() - Add an item to the thread work list + * @thread: thread to queue work to + * @work: struct binder_work to add to list + * + * Adds the work to the todo list of the thread, and enables processing + * of the todo queue. + */ +static void +binder_enqueue_thread_work(struct binder_thread *thread, + struct binder_work *work) +{ + binder_inner_proc_lock(thread->proc); + binder_enqueue_thread_work_ilocked(thread, work); + binder_inner_proc_unlock(thread->proc); } static void @@ -954,7 +1001,7 @@ static long task_close_fd(struct binder_proc *proc, unsigned int fd) static bool binder_has_work_ilocked(struct binder_thread *thread, bool do_proc_work) { - return !binder_worklist_empty_ilocked(&thread->todo) || + return thread->process_todo || thread->looper_need_return || (do_proc_work && !binder_worklist_empty_ilocked(&thread->proc->todo)); @@ -1371,6 +1418,17 @@ static int binder_inc_node_nilocked(struct binder_node *node, int strong, node->local_strong_refs++; if (!node->has_strong_ref && target_list) { binder_dequeue_work_ilocked(&node->work); + /* + * Note: this function is the only place where we queue + * directly to a thread->todo without using the + * corresponding binder_enqueue_thread_work() helper + * functions; in this case it's ok to not set the + * process_todo flag, since we know this node work will + * always be followed by other work that starts queue + * processing: in case of synchronous transactions, a + * BR_REPLY or BR_ERROR; in case of oneway + * transactions, a BR_TRANSACTION_COMPLETE. 
+ */ binder_enqueue_work_ilocked(&node->work, target_list); } } else { @@ -1382,6 +1440,9 @@ static int binder_inc_node_nilocked(struct binder_node *node, int strong, node->debug_id); return -EINVAL; } + /* + * See comment above + */ binder_enqueue_work_ilocked(&node->work, target_list); } } @@ -2071,9 +2132,9 @@ static void binder_send_failed_reply(struct binder_transaction *t, binder_pop_transaction_ilocked(target_thread, t); if (target_thread->reply_error.cmd == BR_OK) { target_thread->reply_error.cmd = error_code; - binder_enqueue_work_ilocked( - &target_thread->reply_error.work, - &target_thread->todo); + binder_enqueue_thread_work_ilocked( + target_thread, + &target_thread->reply_error.work); wake_up_interruptible(&target_thread->wait); } else { WARN(1, "Unexpected reply error: %u\n", @@ -2712,11 +2773,10 @@ static bool binder_proc_transaction(struct binder_transaction *t, struct binder_proc *proc, struct binder_thread *thread) { - struct list_head *target_list = NULL; struct binder_node *node = t->buffer->target_node; struct binder_priority node_prio; bool oneway = !!(t->flags & TF_ONE_WAY); - bool wakeup = true; + bool pending_async = false; BUG_ON(!node); binder_node_lock(node); @@ -2726,8 +2786,7 @@ static bool binder_proc_transaction(struct binder_transaction *t, if (oneway) { BUG_ON(thread); if (node->has_async_transaction) { - target_list = &node->async_todo; - wakeup = false; + pending_async = true; } else { node->has_async_transaction = 1; } @@ -2741,22 +2800,20 @@ static bool binder_proc_transaction(struct binder_transaction *t, return false; } - if (!thread && !target_list) + if (!thread && !pending_async) thread = binder_select_thread_ilocked(proc); if (thread) { - target_list = &thread->todo; binder_transaction_priority(thread->task, t, node_prio, node->inherit_rt); - } else if (!target_list) { - target_list = &proc->todo; + binder_enqueue_thread_work_ilocked(thread, &t->work); + } else if (!pending_async) { + binder_enqueue_work_ilocked(&t->work, &proc->todo); } else { - BUG_ON(target_list != &node->async_todo); + binder_enqueue_work_ilocked(&t->work, &node->async_todo); } - binder_enqueue_work_ilocked(&t->work, target_list); - - if (wakeup) + if (!pending_async) binder_wakeup_thread_ilocked(proc, thread, !oneway /* sync */); binder_inner_proc_unlock(proc); @@ -3258,10 +3315,10 @@ static void binder_transaction(struct binder_proc *proc, } } tcomplete->type = BINDER_WORK_TRANSACTION_COMPLETE; - binder_enqueue_work(proc, tcomplete, &thread->todo); t->work.type = BINDER_WORK_TRANSACTION; if (reply) { + binder_enqueue_thread_work(thread, tcomplete); binder_inner_proc_lock(target_proc); if (target_thread->is_dead) { binder_inner_proc_unlock(target_proc); @@ -3269,7 +3326,7 @@ static void binder_transaction(struct binder_proc *proc, } BUG_ON(t->buffer->async_transaction != 0); binder_pop_transaction_ilocked(target_thread, in_reply_to); - binder_enqueue_work_ilocked(&t->work, &target_thread->todo); + binder_enqueue_thread_work_ilocked(target_thread, &t->work); binder_inner_proc_unlock(target_proc); wake_up_interruptible_sync(&target_thread->wait); binder_restore_priority(current, in_reply_to->saved_priority); @@ -3277,6 +3334,7 @@ static void binder_transaction(struct binder_proc *proc, } else if (!(t->flags & TF_ONE_WAY)) { BUG_ON(t->buffer->async_transaction != 0); binder_inner_proc_lock(proc); + binder_enqueue_thread_work_ilocked_nowake(thread, tcomplete); t->need_reply = 1; t->from_parent = thread->transaction_stack; thread->transaction_stack = t; @@ -3290,6 +3348,7 @@ 
static void binder_transaction(struct binder_proc *proc, } else { BUG_ON(target_node == NULL); BUG_ON(t->buffer->async_transaction != 1); + binder_enqueue_thread_work(thread, tcomplete); if (!binder_proc_transaction(t, target_proc, NULL)) goto err_dead_proc_or_thread; } @@ -3369,15 +3428,11 @@ err_invalid_target_handle: if (in_reply_to) { binder_restore_priority(current, in_reply_to->saved_priority); thread->return_error.cmd = BR_TRANSACTION_COMPLETE; - binder_enqueue_work(thread->proc, - &thread->return_error.work, - &thread->todo); + binder_enqueue_thread_work(thread, &thread->return_error.work); binder_send_failed_reply(in_reply_to, return_error); } else { thread->return_error.cmd = return_error; - binder_enqueue_work(thread->proc, - &thread->return_error.work, - &thread->todo); + binder_enqueue_thread_work(thread, &thread->return_error.work); } } @@ -3681,10 +3736,9 @@ static int binder_thread_write(struct binder_proc *proc, WARN_ON(thread->return_error.cmd != BR_OK); thread->return_error.cmd = BR_ERROR; - binder_enqueue_work( - thread->proc, - &thread->return_error.work, - &thread->todo); + binder_enqueue_thread_work( + thread, + &thread->return_error.work); binder_debug( BINDER_DEBUG_FAILED_TRANSACTION, "%d:%d BC_REQUEST_DEATH_NOTIFICATION failed\n", @@ -3764,9 +3818,9 @@ static int binder_thread_write(struct binder_proc *proc, if (thread->looper & (BINDER_LOOPER_STATE_REGISTERED | BINDER_LOOPER_STATE_ENTERED)) - binder_enqueue_work_ilocked( - &death->work, - &thread->todo); + binder_enqueue_thread_work_ilocked( + thread, + &death->work); else { binder_enqueue_work_ilocked( &death->work, @@ -3821,8 +3875,8 @@ static int binder_thread_write(struct binder_proc *proc, if (thread->looper & (BINDER_LOOPER_STATE_REGISTERED | BINDER_LOOPER_STATE_ENTERED)) - binder_enqueue_work_ilocked( - &death->work, &thread->todo); + binder_enqueue_thread_work_ilocked( + thread, &death->work); else { binder_enqueue_work_ilocked( &death->work, @@ -3996,6 +4050,8 @@ retry: break; } w = binder_dequeue_work_head_ilocked(list); + if (binder_worklist_empty_ilocked(&thread->todo)) + thread->process_todo = false; switch (w->type) { case BINDER_WORK_TRANSACTION: { -- cgit v1.2.3 From c8bc3e3a3ede641b48360ab0bd35eb316168008b Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Tue, 24 Oct 2017 16:37:39 +0200 Subject: ANDROID: binder: show high watermark of alloc->pages. Show the high watermark of the index into the alloc->pages array, to facilitate sizing the buffer on a per-process basis. 
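The bookkeeping is a simple monotonic high-water mark, updated whenever a page is installed (this is the binder_update_page_range() hunk below):

        /* index is the slot in alloc->pages being populated; remember
         * the largest slot ever used for this process.
         */
        if (index + 1 > alloc->pages_high)
                alloc->pages_high = index + 1;

The value is then reported alongside the existing per-process page counts by binder_alloc_print_pages(), e.g. as " pages high watermark: 123" in the binder debugfs stats output.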
Change-Id: I2b40cd16628e0ee45216c51dc9b3c5b0c862032e Signed-off-by: Martijn Coenen --- drivers/android/binder_alloc.c | 4 ++++ drivers/android/binder_alloc.h | 2 ++ 2 files changed, 6 insertions(+) diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index e431cfcfb59b..3a4279d219f7 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -282,6 +282,9 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate, goto err_vm_insert_page_failed; } + if (index + 1 > alloc->pages_high) + alloc->pages_high = index + 1; + trace_binder_alloc_page_end(alloc, index); /* vm_insert_page does not seem to increment the refcount */ } @@ -855,6 +858,7 @@ void binder_alloc_print_pages(struct seq_file *m, } mutex_unlock(&alloc->mutex); seq_printf(m, " pages: %d:%d:%d\n", active, lru, free); + seq_printf(m, " pages high watermark: %zu\n", alloc->pages_high); } /** diff --git a/drivers/android/binder_alloc.h b/drivers/android/binder_alloc.h index 2dd33b6df104..0b145307f1fd 100644 --- a/drivers/android/binder_alloc.h +++ b/drivers/android/binder_alloc.h @@ -92,6 +92,7 @@ struct binder_lru_page { * @pages: array of binder_lru_page * @buffer_size: size of address space specified via mmap * @pid: pid for associated binder_proc (invariant after init) + * @pages_high: high watermark of offset in @pages * * Bookkeeping structure for per-proc address space management for binder * buffers. It is normally initialized during binder_init() and binder_mmap() @@ -112,6 +113,7 @@ struct binder_alloc { size_t buffer_size; uint32_t buffer_free; int pid; + size_t pages_high; }; #ifdef CONFIG_ANDROID_BINDER_IPC_SELFTEST -- cgit v1.2.3 From abe43c97cae28a0b06f632e2fbadd0c20b8cca5e Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Mon, 16 Oct 2017 16:21:19 +0200 Subject: USB: devio: Revert "USB: devio: Don't corrupt user memory" commit 845d584f41eac3475c21e4a7d5e88d0f6e410cf7 upstream. Taking the uurb->buffer_length userspace passes in as a maximum for the actual urbs transfer_buffer_length causes 2 serious issues: 1) It breaks isochronous support for all userspace apps using libusb, as existing libusb versions pass in 0 for uurb->buffer_length, relying on the kernel using the lengths of the usbdevfs_iso_packet_desc descriptors passed in added together as buffer length. This for example causes redirection of USB audio and webcams into virtual machines using qemu-kvm to no longer work. This is a userspace ABI break and as such must be reverted. Note that the original commit does not protect other users / the kernel's memory, it only stops the userspace process making the call from shooting itself in the foot. 2) It may cause the kernel to program host controllers to DMA over random memory. Just as the devio code used to only look at the iso_packet_desc lengths, the host drivers do the same, relying on the submitter of the urbs to make sure the entire buffer is large enough and not checking transfer_buffer_length. But the "USB: devio: Don't corrupt user memory" commit now takes the userspace provided uurb->buffer_length for the buffer-size while copying over the user-provided iso_packet_desc lengths 1:1, allowing the user to specify a small buffer size while programming the host controller to DMA a lot more data. (At least the ohci, uhci, xhci and fhci drivers do not check transfer_buffer_length for isoc transfers.) This reverts commit fa1ed74eb1c2 ("USB: devio: Don't corrupt user memory") fixing both these issues.
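For reference, a sketch of the libusb-style submission that the revert keeps working: buffer_length is left at 0 and the kernel derives the transfer length by summing the per-packet lengths. Error handling is omitted and fd is assumed to be an open usbfs device node:

        #include <linux/usbdevice_fs.h>
        #include <sys/ioctl.h>
        #include <stdlib.h>

        static int submit_iso(int fd, unsigned char ep, void *buf,
                              unsigned int pkts, unsigned int pkt_len)
        {
                struct usbdevfs_urb *urb;
                unsigned int i;

                urb = calloc(1, sizeof(*urb) +
                             pkts * sizeof(struct usbdevfs_iso_packet_desc));
                urb->type = USBDEVFS_URB_TYPE_ISO;
                urb->endpoint = ep;
                urb->buffer = buf;
                urb->buffer_length = 0; /* historical libusb behaviour */
                urb->number_of_packets = pkts;
                for (i = 0; i < pkts; i++)
                        urb->iso_frame_desc[i].length = pkt_len;
                return ioctl(fd, USBDEVFS_SUBMITURB, urb);
        }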
Cc: Dan Carpenter Signed-off-by: Hans de Goede Acked-by: Alan Stern Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/devio.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c index bd9419213d06..873ba02d59e6 100644 --- a/drivers/usb/core/devio.c +++ b/drivers/usb/core/devio.c @@ -1417,11 +1417,7 @@ static int proc_do_submiturb(struct usb_dev_state *ps, struct usbdevfs_urb *uurb totlen += isopkt[u].length; } u *= sizeof(struct usb_iso_packet_descriptor); - if (totlen <= uurb->buffer_length) - uurb->buffer_length = totlen; - else - WARN_ONCE(1, "uurb->buffer_length is too short %d vs %d", - totlen, uurb->buffer_length); + uurb->buffer_length = totlen; break; default: -- cgit v1.2.3 From 9dff499d822660c6dbb2a407a7d85be26f87da07 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Wed, 18 Oct 2017 12:49:38 -0400 Subject: USB: core: fix out-of-bounds access bug in usb_get_bos_descriptor() commit 1c0edc3633b56000e18d82fc241e3995ca18a69e upstream. Andrey used the syzkaller fuzzer to find an out-of-bounds memory access in usb_get_bos_descriptor(). The code wasn't checking that the next usb_dev_cap_header structure could fit into the remaining buffer space. This patch fixes the error and also reduces the bNumDeviceCaps field in the header to match the actual number of capabilities found, in cases where there are fewer than expected. Reported-by: Andrey Konovalov Signed-off-by: Alan Stern Tested-by: Andrey Konovalov Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/config.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/usb/core/config.c b/drivers/usb/core/config.c index d9d048fc9082..5172bec612eb 100644 --- a/drivers/usb/core/config.c +++ b/drivers/usb/core/config.c @@ -926,10 +926,12 @@ int usb_get_bos_descriptor(struct usb_device *dev) for (i = 0; i < num; i++) { buffer += length; cap = (struct usb_dev_cap_header *)buffer; - length = cap->bLength; - if (total_len < length) + if (total_len < sizeof(*cap) || total_len < cap->bLength) { + dev->bos->desc->bNumDeviceCaps = i; break; + } + length = cap->bLength; total_len -= length; if (cap->bDescriptorType != USB_DT_DEVICE_CAPABILITY) { -- cgit v1.2.3 From 4512d6503a4db9c5cad41c18a2671ffaa6cb2be9 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Thu, 12 Oct 2017 14:50:46 +0200 Subject: USB: serial: metro-usb: add MS7820 device id commit 31dc3f819bac28a0990b36510197560258ab7421 upstream. Add device-id entry for (Honeywell) Metrologic MS7820 bar code scanner. The device has two interfaces (in this mode?); a vendor-specific interface with two interrupt endpoints and a second HID interface, which we do not bind to. Reported-by: Ladislav Dobrovsky Tested-by: Ladislav Dobrovsky Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/metro-usb.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/serial/metro-usb.c b/drivers/usb/serial/metro-usb.c index 39e683096e94..45182c65fa1f 100644 --- a/drivers/usb/serial/metro-usb.c +++ b/drivers/usb/serial/metro-usb.c @@ -45,6 +45,7 @@ struct metrousb_private { static const struct usb_device_id id_table[] = { { USB_DEVICE(FOCUS_VENDOR_ID, FOCUS_PRODUCT_ID_BI) }, { USB_DEVICE(FOCUS_VENDOR_ID, FOCUS_PRODUCT_ID_UNI) }, + { USB_DEVICE_INTERFACE_CLASS(0x0c2e, 0x0730, 0xff) }, /* MS7820 */ { }, /* Terminating entry. 
*/ }; MODULE_DEVICE_TABLE(usb, id_table); -- cgit v1.2.3 From d729f29a291f61fbb8d406eabfbc969c3f6cccc2 Mon Sep 17 00:00:00 2001 From: Maksim Salau Date: Wed, 11 Oct 2017 11:10:52 +0300 Subject: usb: cdc_acm: Add quirk for Elatec TWN3 commit 765fb2f181cad669f2beb87842a05d8071f2be85 upstream. Elatec TWN3 has the union descriptor on data interface. This results in failure to bind the device to the driver with the following log: usb 1-1.2: new full speed USB device using streamplug-ehci and address 4 usb 1-1.2: New USB device found, idVendor=09d8, idProduct=0320 usb 1-1.2: New USB device strings: Mfr=1, Product=2, SerialNumber=0 usb 1-1.2: Product: RFID Device (COM) usb 1-1.2: Manufacturer: OEM cdc_acm 1-1.2:1.0: Zero length descriptor references cdc_acm: probe of 1-1.2:1.0 failed with error -22 Adding the NO_UNION_NORMAL quirk for the device fixes the issue. `lsusb -v` of the device: Bus 001 Device 003: ID 09d8:0320 Device Descriptor: bLength 18 bDescriptorType 1 bcdUSB 2.00 bDeviceClass 2 Communications bDeviceSubClass 0 bDeviceProtocol 0 bMaxPacketSize0 32 idVendor 0x09d8 idProduct 0x0320 bcdDevice 3.00 iManufacturer 1 OEM iProduct 2 RFID Device (COM) iSerial 0 bNumConfigurations 1 Configuration Descriptor: bLength 9 bDescriptorType 2 wTotalLength 67 bNumInterfaces 2 bConfigurationValue 1 iConfiguration 0 bmAttributes 0x80 (Bus Powered) MaxPower 250mA Interface Descriptor: bLength 9 bDescriptorType 4 bInterfaceNumber 0 bAlternateSetting 0 bNumEndpoints 1 bInterfaceClass 2 Communications bInterfaceSubClass 2 Abstract (modem) bInterfaceProtocol 1 AT-commands (v.25ter) iInterface 0 Endpoint Descriptor: bLength 7 bDescriptorType 5 bEndpointAddress 0x83 EP 3 IN bmAttributes 3 Transfer Type Interrupt Synch Type None Usage Type Data wMaxPacketSize 0x0020 1x 32 bytes bInterval 2 Interface Descriptor: bLength 9 bDescriptorType 4 bInterfaceNumber 1 bAlternateSetting 0 bNumEndpoints 2 bInterfaceClass 10 CDC Data bInterfaceSubClass 0 Unused bInterfaceProtocol 0 iInterface 0 Endpoint Descriptor: bLength 7 bDescriptorType 5 bEndpointAddress 0x02 EP 2 OUT bmAttributes 2 Transfer Type Bulk Synch Type None Usage Type Data wMaxPacketSize 0x0020 1x 32 bytes bInterval 0 Endpoint Descriptor: bLength 7 bDescriptorType 5 bEndpointAddress 0x81 EP 1 IN bmAttributes 2 Transfer Type Bulk Synch Type None Usage Type Data wMaxPacketSize 0x0020 1x 32 bytes bInterval 0 CDC Header: bcdCDC 1.10 CDC Call Management: bmCapabilities 0x03 call management use DataInterface bDataInterface 1 CDC ACM: bmCapabilities 0x06 sends break line coding and serial state CDC Union: bMasterInterface 0 bSlaveInterface 1 Device Status: 0x0000 (Bus Powered) Signed-off-by: Maksim Salau Acked-by: Oliver Neukum Signed-off-by: Greg Kroah-Hartman --- drivers/usb/class/cdc-acm.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/usb/class/cdc-acm.c b/drivers/usb/class/cdc-acm.c index df96f5f88c15..3f6bb3fff890 100644 --- a/drivers/usb/class/cdc-acm.c +++ b/drivers/usb/class/cdc-acm.c @@ -1762,6 +1762,9 @@ static const struct usb_device_id acm_ids[] = { { USB_DEVICE(0xfff0, 0x0100), /* DATECS FP-2000 */ .driver_info = NO_UNION_NORMAL, /* reports zero length descriptor */ }, + { USB_DEVICE(0x09d8, 0x0320), /* Elatec GmbH TWN3 */ + .driver_info = NO_UNION_NORMAL, /* has misplaced union descriptor */ + }, { USB_DEVICE(0x2912, 0x0001), /* ATOL FPrint */ .driver_info = CLEAR_HALT_CONDITIONS, -- cgit v1.2.3 From d012ab210f5f4ff4e6f5567fe80e1dcc82077c2d Mon Sep 17 00:00:00 2001 From: Felipe Balbi Date: Tue, 3 Oct 2017 11:16:43 +0300 Subject: usb: 
quirks: add quirk for WORLDE MINI MIDI keyboard MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 2811501e6d8f5747d08f8e25b9ecf472d0dc4c7d upstream. This keyboard doesn't implement Get String descriptors properly even though string indexes are valid. What happens is that when requesting the String descriptor, the device disconnects and reconnects. Without this quirk, this loop will continue forever. Cc: Alan Stern Reported-by: Владимир Мартьянов Signed-off-by: Felipe Balbi Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/quirks.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/usb/core/quirks.c b/drivers/usb/core/quirks.c index 82806e311202..a6aaf2f193a4 100644 --- a/drivers/usb/core/quirks.c +++ b/drivers/usb/core/quirks.c @@ -221,6 +221,10 @@ static const struct usb_device_id usb_quirk_list[] = { /* Corsair Strafe RGB */ { USB_DEVICE(0x1b1c, 0x1b20), .driver_info = USB_QUIRK_DELAY_INIT }, + /* MIDI keyboard WORLDE MINI */ + { USB_DEVICE(0x1c75, 0x0204), .driver_info = + USB_QUIRK_CONFIG_INTF_STRINGS }, + /* Acer C120 LED Projector */ { USB_DEVICE(0x1de1, 0xc102), .driver_info = USB_QUIRK_NO_LPM }, -- cgit v1.2.3 From 67e25805e74879eb40bffe7b8e660c3966ee113a Mon Sep 17 00:00:00 2001 From: Mathias Nyman Date: Tue, 17 Oct 2017 16:07:33 +0300 Subject: usb: hub: Allow reset retry for USB2 devices on connect bounce commit 1ac7db63333db1eeff901bfd6bbcd502b4634fa4 upstream. If the connect status change is set during reset signaling, but the status remains connected, just retry the port reset. This solves an issue with connecting a 90W HP Thunderbolt 3 dock to a Lenovo Carbon x1 (5th generation) which causes a 30 min loop of a high speed device being re-discovered before the USB ports start working. [...] [ 389.023845] usb 3-1: new high-speed USB device number 55 using xhci_hcd [ 389.491841] usb 3-1: new high-speed USB device number 56 using xhci_hcd [ 389.959928] usb 3-1: new high-speed USB device number 57 using xhci_hcd [...] This is caused by a high speed device that doesn't successfully go to the enabled state after the second port reset. Instead the connection bounces (connected, with connect status change), bailing out completely from enumeration just to restart from scratch. Link: https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1716332 Signed-off-by: Mathias Nyman Acked-by: Alan Stern Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/hub.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c index 51bba58c0c3b..22e61786354a 100644 --- a/drivers/usb/core/hub.c +++ b/drivers/usb/core/hub.c @@ -2656,13 +2656,16 @@ static int hub_port_wait_reset(struct usb_hub *hub, int port1, if (!(portstatus & USB_PORT_STAT_CONNECTION)) return -ENOTCONN; - /* bomb out completely if the connection bounced. A USB 3.0 - * connection may bounce if multiple warm resets were issued, + /* Retry if connect change is set but status is still connected. + * A USB 3.0 connection may bounce if multiple warm resets were issued, * but the device may have successfully re-connected. Ignore it.
*/ if (!hub_is_superspeed(hub->hdev) && - (portchange & USB_PORT_STAT_C_CONNECTION)) - return -ENOTCONN; + (portchange & USB_PORT_STAT_C_CONNECTION)) { + usb_clear_port_feature(hub->hdev, port1, + USB_PORT_FEAT_C_CONNECTION); + return -EAGAIN; + } if (!(portstatus & USB_PORT_STAT_ENABLE)) return -EBUSY; -- cgit v1.2.3 From f7d8b2e150eeb16646e398f90d7506dc84d8a32d Mon Sep 17 00:00:00 2001 From: Jussi Laako Date: Sun, 15 Oct 2017 12:41:32 +0300 Subject: ALSA: usb-audio: Add native DSD support for Pro-Ject Pre Box S2 Digital commit 9bb201a5d5acc733943e8af7151cceab9d976a69 upstream. Add native DSD support quirk for Pro-Ject Pre Box S2 Digital USB id 2772:0230. Signed-off-by: Jussi Laako Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/usb/quirks.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index 1cc20d138dae..9c5368e7ee23 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -1305,6 +1305,7 @@ u64 snd_usb_interface_dsd_format_quirks(struct snd_usb_audio *chip, case USB_ID(0x20b1, 0x2008): /* Matrix Audio X-Sabre */ case USB_ID(0x20b1, 0x300a): /* Matrix Audio Mini-i Pro */ case USB_ID(0x22d9, 0x0416): /* OPPO HA-1 */ + case USB_ID(0x2772, 0x0230): /* Pro-Ject Pre Box S2 Digital */ if (fp->altsetting == 2) return SNDRV_PCM_FMTBIT_DSD_U32_BE; break; -- cgit v1.2.3 From d68b07a19a9ed71a60ffe28cf0a8aa5988dcdc5e Mon Sep 17 00:00:00 2001 From: Wolfgang Grandegger Date: Thu, 14 Sep 2017 18:37:14 +0200 Subject: can: gs_usb: fix busy loop if no more TX context is available commit 97819f943063b622eca44d3644067c190dc75039 upstream. If sending messages with no cable connected, it quickly happens that there is no more TX context available. Then "gs_can_start_xmit()" returns with "NETDEV_TX_BUSY" and the upper layer does retry immediately keeping the CPU busy. To fix that issue, I moved "atomic_dec(&dev->active_tx_urbs)" from "gs_usb_xmit_callback()" to the TX done handling in "gs_usb_receive_bulk_callback()". Renaming "active_tx_urbs" to "active_tx_contexts" and moving it into "gs_[alloc|free]_tx_context()" would also make sense. Signed-off-by: Wolfgang Grandegger Signed-off-by: Marc Kleine-Budde Signed-off-by: Greg Kroah-Hartman --- drivers/net/can/usb/gs_usb.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/drivers/net/can/usb/gs_usb.c b/drivers/net/can/usb/gs_usb.c index ae5709354546..27e2352fcc42 100644 --- a/drivers/net/can/usb/gs_usb.c +++ b/drivers/net/can/usb/gs_usb.c @@ -356,6 +356,8 @@ static void gs_usb_receive_bulk_callback(struct urb *urb) gs_free_tx_context(txc); + atomic_dec(&dev->active_tx_urbs); + netif_wake_queue(netdev); } @@ -444,14 +446,6 @@ static void gs_usb_xmit_callback(struct urb *urb) urb->transfer_buffer_length, urb->transfer_buffer, urb->transfer_dma); - - atomic_dec(&dev->active_tx_urbs); - - if (!netif_device_present(netdev)) - return; - - if (netif_queue_stopped(netdev)) - netif_wake_queue(netdev); } static netdev_tx_t gs_can_start_xmit(struct sk_buff *skb, struct net_device *netdev) -- cgit v1.2.3 From 583a4219841d00e96b5de55be160aa7eb7721a4d Mon Sep 17 00:00:00 2001 From: Jonathan Liu Date: Mon, 9 Oct 2017 22:46:13 -0500 Subject: usb: musb: sunxi: Explicitly release USB PHY on exit commit 6ed05c68cbcae42cd52b8e53b66952bfa9c002ce upstream. This fixes a kernel oops when unloading the driver due to usb_put_phy being called after usb_phy_generic_unregister when the device is detached. 
Calling usb_phy_generic_unregister causes x->dev->driver to be NULL in usb_put_phy and results in a NULL pointer dereference. Signed-off-by: Jonathan Liu Signed-off-by: Bin Liu Signed-off-by: Greg Kroah-Hartman --- drivers/usb/musb/sunxi.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/usb/musb/sunxi.c b/drivers/usb/musb/sunxi.c index d9b0dc461439..2d3be66fb563 100644 --- a/drivers/usb/musb/sunxi.c +++ b/drivers/usb/musb/sunxi.c @@ -320,6 +320,8 @@ static int sunxi_musb_exit(struct musb *musb) if (test_bit(SUNXI_MUSB_FL_HAS_SRAM, &glue->flags)) sunxi_sram_release(musb->controller->parent); + devm_usb_put_phy(glue->dev, glue->xceiv); + return 0; } -- cgit v1.2.3 From aa629364c0e3c5ed3d21690a3a370601ac66eaee Mon Sep 17 00:00:00 2001 From: Jonathan Liu Date: Mon, 9 Oct 2017 22:46:12 -0500 Subject: usb: musb: Check for host-mode using is_host_active() on reset interrupt commit 445ef61543da3db5b699f87fb0aa4f227165f6ed upstream. The sunxi musb has a bug where sometimes it will generate a babble error on device disconnect instead of a disconnect IRQ. When this happens the musb controller switches from host mode to device mode (it clears MUSB_DEVCTL_HM/MUSB_DEVCTL_SESSION and sets MUSB_DEVCTL_BDEVICE) and gets stuck in this state. The babble error is misdetected as a bus reset because MUSB_DEVCTL_HM was cleared. To fix this, use is_host_active() rather than (devctl & MUSB_DEVCTL_HM) to detect babble error so that sunxi musb babble recovery can handle it by restoring the mode. This information is provided by the driver logic and does not rely on register contents. Signed-off-by: Jonathan Liu Signed-off-by: Bin Liu Signed-off-by: Greg Kroah-Hartman --- drivers/usb/musb/musb_core.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/usb/musb/musb_core.c b/drivers/usb/musb/musb_core.c index 00eed5d66fda..06d83825923a 100644 --- a/drivers/usb/musb/musb_core.c +++ b/drivers/usb/musb/musb_core.c @@ -877,7 +877,7 @@ b_host: */ if (int_usb & MUSB_INTR_RESET) { handled = IRQ_HANDLED; - if (devctl & MUSB_DEVCTL_HM) { + if (is_host_active(musb)) { /* * When BABBLE happens what we can depends on which * platform MUSB is running, because some platforms @@ -887,9 +887,7 @@ b_host: * drop the session. */ dev_err(musb->controller, "Babble\n"); - - if (is_host_active(musb)) - musb_recover_from_babble(musb); + musb_recover_from_babble(musb); } else { dev_dbg(musb->controller, "BUS RESET as %s\n", usb_otg_state_string(musb->xceiv->otg->state)); -- cgit v1.2.3 From 4d56587c28d44308e5db275b34e5200fab2cc2f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20M=C3=A4tje?= Date: Wed, 18 Oct 2017 13:25:17 +0200 Subject: can: esd_usb2: Fix can_dlc value for received RTR, frames MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 72d92e865d1560723e1957ee3f393688c49ca5bf upstream. The dlc member of the struct rx_msg contains also the ESD_RTR flag to mark received RTR frames. Without the fix the can_dlc value for received RTR frames would always be set to 8 by get_can_dlc() instead of the received value. 
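A worked example, assuming ESD_RTR is the 0x10 flag bit in the dlc byte as defined by the driver: an RTR frame with a real dlc of 4 arrives with msg->msg.rx.dlc == 0x14.

        u8 dlc = 0x10 | 4;              /* RTR flag folded into dlc byte */
        get_can_dlc(dlc);               /* 20 clamps to 8: wrong         */
        get_can_dlc(dlc & ~ESD_RTR);    /* 4: the received value         */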
Fixes: 96d8e90382dc ("can: Add driver for esd CAN-USB/2 device") Signed-off-by: Stefan Mätje Signed-off-by: Marc Kleine-Budde Signed-off-by: Greg Kroah-Hartman --- drivers/net/can/usb/esd_usb2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/can/usb/esd_usb2.c b/drivers/net/can/usb/esd_usb2.c index 113e64fcd73b..4c6707ecc619 100644 --- a/drivers/net/can/usb/esd_usb2.c +++ b/drivers/net/can/usb/esd_usb2.c @@ -333,7 +333,7 @@ static void esd_usb2_rx_can_msg(struct esd_usb2_net_priv *priv, } cf->can_id = id & ESD_IDMASK; - cf->can_dlc = get_can_dlc(msg->msg.rx.dlc); + cf->can_dlc = get_can_dlc(msg->msg.rx.dlc & ~ESD_RTR); if (id & ESD_EXTID) cf->can_id |= CAN_EFF_FLAG; -- cgit v1.2.3 From 195674adee572b97761501f3906e4cc0e290891f Mon Sep 17 00:00:00 2001 From: Ilia Mirkin Date: Sun, 1 Oct 2017 13:52:43 -0400 Subject: drm/nouveau/bsp/g92: disable by default commit 194d68dd051c2dd5ac2b522ae16100e774e8d869 upstream. G92's seem to require some additional bit of initialization before the BSP engine can work. It feels like clocks are not set up for the underlying VLD engine, which means that all commands submitted to the xtensa chip end up hanging. VP seems to work fine though. This still allows people to force-enable the bsp engine if they want to play around with it, but makes it harder for the card to hang by default. Signed-off-by: Ilia Mirkin Signed-off-by: Ben Skeggs Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/nouveau/nvkm/engine/bsp/g84.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/bsp/g84.c b/drivers/gpu/drm/nouveau/nvkm/engine/bsp/g84.c index 3ef01071f073..103471ff4dc4 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/bsp/g84.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/bsp/g84.c @@ -40,5 +40,5 @@ int g84_bsp_new(struct nvkm_device *device, int index, struct nvkm_engine **pengine) { return nvkm_xtensa_new_(&g84_bsp, device, index, - true, 0x103000, pengine); + device->chipset != 0x92, 0x103000, pengine); } -- cgit v1.2.3 From 4516069f1b05d3e84f5e11a8490ee0ca09997731 Mon Sep 17 00:00:00 2001 From: Ben Skeggs Date: Mon, 25 Sep 2017 15:05:38 +1000 Subject: drm/nouveau/mmu: flush tlbs before deleting page tables commit 77913bbcb43ac9a07a6fe849c2fd3bf85fc8bdd8 upstream. Even though we've zeroed the PDE, the GPU may have cached the PD, so we need to flush when deleting them. Noticed while working on replacement MMU code, but a backport might be a good idea, so let's fix it in the current code too. Signed-off-by: Ben Skeggs Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/nouveau/nvkm/subdev/mmu/base.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/base.c b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/base.c index e04a2296ecd0..5bb7f7e0f11f 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/base.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/base.c @@ -240,6 +240,8 @@ nvkm_vm_unmap_pgt(struct nvkm_vm *vm, int big, u32 fpde, u32 lpde) mmu->func->map_pgt(vpgd->obj, pde, vpgt->mem); } + mmu->func->flush(vm); + nvkm_memory_del(&pgt); } } -- cgit v1.2.3 From f9e937124ec22a0c36c45a2d07d693492a46471e Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Wed, 18 Oct 2017 00:45:49 +0100 Subject: ALSA: seq: Enable 'use' locking in all configurations commit 8009d506a1dd00cf436b0c4cca0dcec130580a21 upstream. The 'use' locking macros are no-ops if neither SMP or SND_DEBUG is enabled. 
This might once have been OK in non-preemptible configurations, but even in that case snd_seq_read() may sleep while relying on a 'use' lock. So always use the proper implementations. Signed-off-by: Ben Hutchings Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/core/seq/seq_lock.c | 4 ---- sound/core/seq/seq_lock.h | 12 ------------ 2 files changed, 16 deletions(-) diff --git a/sound/core/seq/seq_lock.c b/sound/core/seq/seq_lock.c index 12ba83367b1b..ba5752ee9af3 100644 --- a/sound/core/seq/seq_lock.c +++ b/sound/core/seq/seq_lock.c @@ -23,8 +23,6 @@ #include #include "seq_lock.h" -#if defined(CONFIG_SMP) || defined(CONFIG_SND_DEBUG) - /* wait until all locks are released */ void snd_use_lock_sync_helper(snd_use_lock_t *lockp, const char *file, int line) { @@ -42,5 +40,3 @@ void snd_use_lock_sync_helper(snd_use_lock_t *lockp, const char *file, int line) } EXPORT_SYMBOL(snd_use_lock_sync_helper); - -#endif diff --git a/sound/core/seq/seq_lock.h b/sound/core/seq/seq_lock.h index 54044bc2c9ef..ac38031c370e 100644 --- a/sound/core/seq/seq_lock.h +++ b/sound/core/seq/seq_lock.h @@ -3,8 +3,6 @@ #include -#if defined(CONFIG_SMP) || defined(CONFIG_SND_DEBUG) - typedef atomic_t snd_use_lock_t; /* initialize lock */ @@ -20,14 +18,4 @@ typedef atomic_t snd_use_lock_t; void snd_use_lock_sync_helper(snd_use_lock_t *lock, const char *file, int line); #define snd_use_lock_sync(lockp) snd_use_lock_sync_helper(lockp, __BASE_FILE__, __LINE__) -#else /* SMP || CONFIG_SND_DEBUG */ - -typedef spinlock_t snd_use_lock_t; /* dummy */ -#define snd_use_lock_init(lockp) /**/ -#define snd_use_lock_use(lockp) /**/ -#define snd_use_lock_free(lockp) /**/ -#define snd_use_lock_sync(lockp) /**/ - -#endif /* SMP || CONFIG_SND_DEBUG */ - #endif /* __SND_SEQ_LOCK_H */ -- cgit v1.2.3 From 68c610776cfb1dc1f541d88b17918a4182adf392 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 17 Oct 2017 11:58:17 +0200 Subject: ALSA: hda: Remove superfluous '-' added by printk conversion commit 6bf88a343db2b3c160edf9b82a74966b31cc80bd upstream. While converting the error messages to the standard macros in the commit 4e76a8833fac ("ALSA: hda - Replace with standard printk"), a superfluous '-' slipped in the code mistakenly. Its influence is almost negligible, merely shows a dB value as negative integer instead of positive integer (or vice versa) in the rare error message. So let's kill this embarrassing byte to show more correct value. Fixes: 4e76a8833fac ("ALSA: hda - Replace with standard printk") Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/pci/hda/hda_codec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/pci/hda/hda_codec.c b/sound/pci/hda/hda_codec.c index 83741887faa1..3324f98c35f6 100644 --- a/sound/pci/hda/hda_codec.c +++ b/sound/pci/hda/hda_codec.c @@ -1755,7 +1755,7 @@ static int get_kctl_0dB_offset(struct hda_codec *codec, return -1; if (*step_to_check && *step_to_check != step) { codec_err(codec, "Mismatching dB step for vmaster slave (%d!=%d)\n", -- *step_to_check, step); + *step_to_check, step); return -1; } *step_to_check = step; -- cgit v1.2.3 From efdcbffb2b16da90eb5446a681cd3082b9b027f0 Mon Sep 17 00:00:00 2001 From: Pontus Andersson Date: Mon, 2 Oct 2017 14:45:19 +0200 Subject: i2c: ismt: Separate I2C block read from SMBus block read commit c6ebcedbab7ca78984959386012a17b21183e1a3 upstream. Commit b6c159a9cb69 ("i2c: ismt: Don't duplicate the receive length for block reads") broke I2C block reads. 
It aimed to fix normal SMBus block read, but changed the correct behavior of I2C block read in the process. According to Documentation/i2c/smbus-protocol, one vital difference between normal SMBus block read and I2C block read is that there is no byte count prefixed in the data sent on the wire: SMBus Block Read: i2c_smbus_read_block_data() S Addr Wr [A] Comm [A] S Addr Rd [A] [Count] A [Data] A [Data] A ... A [Data] NA P I2C Block Read: i2c_smbus_read_i2c_block_data() S Addr Wr [A] Comm [A] S Addr Rd [A] [Data] A [Data] A ... A [Data] NA P Therefore the two transaction types need to be processed differently in the driver by copying the dma_buffer as done previously for the I2C_SMBUS_I2C_BLOCK_DATA case. Fixes: b6c159a9cb69 ("i2c: ismt: Don't duplicate the receive length for block reads") Signed-off-by: Pontus Andersson Tested-by: Stephen Douthit Signed-off-by: Wolfram Sang Signed-off-by: Greg Kroah-Hartman --- drivers/i2c/busses/i2c-ismt.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/i2c/busses/i2c-ismt.c b/drivers/i2c/busses/i2c-ismt.c index 639d1a9c8793..1111cb966a44 100644 --- a/drivers/i2c/busses/i2c-ismt.c +++ b/drivers/i2c/busses/i2c-ismt.c @@ -338,12 +338,15 @@ static int ismt_process_desc(const struct ismt_desc *desc, data->word = dma_buffer[0] | (dma_buffer[1] << 8); break; case I2C_SMBUS_BLOCK_DATA: - case I2C_SMBUS_I2C_BLOCK_DATA: if (desc->rxbytes != dma_buffer[0] + 1) return -EMSGSIZE; memcpy(data->block, dma_buffer, desc->rxbytes); break; + case I2C_SMBUS_I2C_BLOCK_DATA: + memcpy(&data->block[1], dma_buffer, desc->rxbytes); + data->block[0] = desc->rxbytes; + break; } return 0; } -- cgit v1.2.3 From 260b6739e8b7ce23ef5c73b99b0c399083e72db9 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 22 Sep 2017 23:29:12 +0200 Subject: brcmsmac: make some local variables 'static const' to reduce stack size commit c503dd38f850be28867ef7a42d9abe5ade81a9bd upstream. With KASAN and a couple of other patches applied, this driver is one of the few remaining ones that actually use more than 2048 bytes of kernel stack: broadcom/brcm80211/brcmsmac/phy/phy_n.c: In function 'wlc_phy_workarounds_nphy_gainctrl': broadcom/brcm80211/brcmsmac/phy/phy_n.c:16065:1: warning: the frame size of 3264 bytes is larger than 2048 bytes [-Wframe-larger-than=] broadcom/brcm80211/brcmsmac/phy/phy_n.c: In function 'wlc_phy_workarounds_nphy': broadcom/brcm80211/brcmsmac/phy/phy_n.c:17138:1: warning: the frame size of 2864 bytes is larger than 2048 bytes [-Wframe-larger-than=] Here, I'm reducing the stack size by marking as many local variables as 'static const' as I can without changing the actual code. This is the first of three patches to improve the stack usage in this driver. It would be good to have this backported to stable kernels to get all drivers in 'allmodconfig' below the 2048 byte limit so we can turn on the frame warning again globally, but I realize that the patch is larger than the normal limit for stable backports. The other two patches do not need to be backported.
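The mechanism in miniature: a non-static local array with an initializer is rebuilt on the stack at every call, while a static const array lives in .rodata and costs no stack at all. A sketch, where use() is a hypothetical consumer and s8 is the kernel's signed 8-bit type:

        extern void use(const s8 *gains);

        void before(void)
        {
                s8 gain_db[] = { 7, 11, 16, 23 };       /* copied to stack */
                use(gain_db);
        }

        void after(void)
        {
                static const s8 gain_db[] = { 7, 11, 16, 23 }; /* .rodata */
                use(gain_db);
        }

With dozens of such tables live in one function, the per-call stack copies are what pushed these frames past the 2048-byte limit.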
Acked-by: Arend van Spriel Signed-off-by: Arnd Bergmann Signed-off-by: Kalle Valo Signed-off-by: Greg Kroah-Hartman --- .../net/wireless/brcm80211/brcmsmac/phy/phy_n.c | 197 ++++++++++----------- 1 file changed, 97 insertions(+), 100 deletions(-) diff --git a/drivers/net/wireless/brcm80211/brcmsmac/phy/phy_n.c b/drivers/net/wireless/brcm80211/brcmsmac/phy/phy_n.c index 99dac9b8a082..c75bfd3f8cb3 100644 --- a/drivers/net/wireless/brcm80211/brcmsmac/phy/phy_n.c +++ b/drivers/net/wireless/brcm80211/brcmsmac/phy/phy_n.c @@ -14764,8 +14764,8 @@ static void wlc_phy_ipa_restore_tx_digi_filts_nphy(struct brcms_phy *pi) } static void -wlc_phy_set_rfseq_nphy(struct brcms_phy *pi, u8 cmd, u8 *events, u8 *dlys, - u8 len) +wlc_phy_set_rfseq_nphy(struct brcms_phy *pi, u8 cmd, const u8 *events, + const u8 *dlys, u8 len) { u32 t1_offset, t2_offset; u8 ctr; @@ -15240,16 +15240,16 @@ static void wlc_phy_workarounds_nphy_gainctrl_2057_rev5(struct brcms_phy *pi) static void wlc_phy_workarounds_nphy_gainctrl_2057_rev6(struct brcms_phy *pi) { u16 currband; - s8 lna1G_gain_db_rev7[] = { 9, 14, 19, 24 }; - s8 *lna1_gain_db = NULL; - s8 *lna1_gain_db_2 = NULL; - s8 *lna2_gain_db = NULL; - s8 tiaA_gain_db_rev7[] = { -9, -6, -3, 0, 3, 3, 3, 3, 3, 3 }; - s8 *tia_gain_db; - s8 tiaA_gainbits_rev7[] = { 0, 1, 2, 3, 4, 4, 4, 4, 4, 4 }; - s8 *tia_gainbits; - u16 rfseqA_init_gain_rev7[] = { 0x624f, 0x624f }; - u16 *rfseq_init_gain; + static const s8 lna1G_gain_db_rev7[] = { 9, 14, 19, 24 }; + const s8 *lna1_gain_db = NULL; + const s8 *lna1_gain_db_2 = NULL; + const s8 *lna2_gain_db = NULL; + static const s8 tiaA_gain_db_rev7[] = { -9, -6, -3, 0, 3, 3, 3, 3, 3, 3 }; + const s8 *tia_gain_db; + static const s8 tiaA_gainbits_rev7[] = { 0, 1, 2, 3, 4, 4, 4, 4, 4, 4 }; + const s8 *tia_gainbits; + static const u16 rfseqA_init_gain_rev7[] = { 0x624f, 0x624f }; + const u16 *rfseq_init_gain; u16 init_gaincode; u16 clip1hi_gaincode; u16 clip1md_gaincode = 0; @@ -15310,10 +15310,9 @@ static void wlc_phy_workarounds_nphy_gainctrl_2057_rev6(struct brcms_phy *pi) if ((freq <= 5080) || (freq == 5825)) { - s8 lna1A_gain_db_rev7[] = { 11, 16, 20, 24 }; - s8 lna1A_gain_db_2_rev7[] = { - 11, 17, 22, 25}; - s8 lna2A_gain_db_rev7[] = { -1, 6, 10, 14 }; + static const s8 lna1A_gain_db_rev7[] = { 11, 16, 20, 24 }; + static const s8 lna1A_gain_db_2_rev7[] = { 11, 17, 22, 25}; + static const s8 lna2A_gain_db_rev7[] = { -1, 6, 10, 14 }; crsminu_th = 0x3e; lna1_gain_db = lna1A_gain_db_rev7; @@ -15321,10 +15320,9 @@ static void wlc_phy_workarounds_nphy_gainctrl_2057_rev6(struct brcms_phy *pi) lna2_gain_db = lna2A_gain_db_rev7; } else if ((freq >= 5500) && (freq <= 5700)) { - s8 lna1A_gain_db_rev7[] = { 11, 17, 21, 25 }; - s8 lna1A_gain_db_2_rev7[] = { - 12, 18, 22, 26}; - s8 lna2A_gain_db_rev7[] = { 1, 8, 12, 16 }; + static const s8 lna1A_gain_db_rev7[] = { 11, 17, 21, 25 }; + static const s8 lna1A_gain_db_2_rev7[] = { 12, 18, 22, 26}; + static const s8 lna2A_gain_db_rev7[] = { 1, 8, 12, 16 }; crsminu_th = 0x45; clip1md_gaincode_B = 0x14; @@ -15335,10 +15333,9 @@ static void wlc_phy_workarounds_nphy_gainctrl_2057_rev6(struct brcms_phy *pi) lna2_gain_db = lna2A_gain_db_rev7; } else { - s8 lna1A_gain_db_rev7[] = { 12, 18, 22, 26 }; - s8 lna1A_gain_db_2_rev7[] = { - 12, 18, 22, 26}; - s8 lna2A_gain_db_rev7[] = { -1, 6, 10, 14 }; + static const s8 lna1A_gain_db_rev7[] = { 12, 18, 22, 26 }; + static const s8 lna1A_gain_db_2_rev7[] = { 12, 18, 22, 26}; + static const s8 lna2A_gain_db_rev7[] = { -1, 6, 10, 14 }; crsminu_th = 0x41; lna1_gain_db = 
lna1A_gain_db_rev7; @@ -15450,65 +15447,65 @@ static void wlc_phy_workarounds_nphy_gainctrl(struct brcms_phy *pi) NPHY_RFSEQ_CMD_CLR_HIQ_DIS, NPHY_RFSEQ_CMD_SET_HPF_BW }; - u8 rfseq_updategainu_dlys[] = { 10, 30, 1 }; - s8 lna1G_gain_db[] = { 7, 11, 16, 23 }; - s8 lna1G_gain_db_rev4[] = { 8, 12, 17, 25 }; - s8 lna1G_gain_db_rev5[] = { 9, 13, 18, 26 }; - s8 lna1G_gain_db_rev6[] = { 8, 13, 18, 25 }; - s8 lna1G_gain_db_rev6_224B0[] = { 10, 14, 19, 27 }; - s8 lna1A_gain_db[] = { 7, 11, 17, 23 }; - s8 lna1A_gain_db_rev4[] = { 8, 12, 18, 23 }; - s8 lna1A_gain_db_rev5[] = { 6, 10, 16, 21 }; - s8 lna1A_gain_db_rev6[] = { 6, 10, 16, 21 }; - s8 *lna1_gain_db = NULL; - s8 lna2G_gain_db[] = { -5, 6, 10, 14 }; - s8 lna2G_gain_db_rev5[] = { -3, 7, 11, 16 }; - s8 lna2G_gain_db_rev6[] = { -5, 6, 10, 14 }; - s8 lna2G_gain_db_rev6_224B0[] = { -5, 6, 10, 15 }; - s8 lna2A_gain_db[] = { -6, 2, 6, 10 }; - s8 lna2A_gain_db_rev4[] = { -5, 2, 6, 10 }; - s8 lna2A_gain_db_rev5[] = { -7, 0, 4, 8 }; - s8 lna2A_gain_db_rev6[] = { -7, 0, 4, 8 }; - s8 *lna2_gain_db = NULL; - s8 tiaG_gain_db[] = { + static const u8 rfseq_updategainu_dlys[] = { 10, 30, 1 }; + static const s8 lna1G_gain_db[] = { 7, 11, 16, 23 }; + static const s8 lna1G_gain_db_rev4[] = { 8, 12, 17, 25 }; + static const s8 lna1G_gain_db_rev5[] = { 9, 13, 18, 26 }; + static const s8 lna1G_gain_db_rev6[] = { 8, 13, 18, 25 }; + static const s8 lna1G_gain_db_rev6_224B0[] = { 10, 14, 19, 27 }; + static const s8 lna1A_gain_db[] = { 7, 11, 17, 23 }; + static const s8 lna1A_gain_db_rev4[] = { 8, 12, 18, 23 }; + static const s8 lna1A_gain_db_rev5[] = { 6, 10, 16, 21 }; + static const s8 lna1A_gain_db_rev6[] = { 6, 10, 16, 21 }; + const s8 *lna1_gain_db = NULL; + static const s8 lna2G_gain_db[] = { -5, 6, 10, 14 }; + static const s8 lna2G_gain_db_rev5[] = { -3, 7, 11, 16 }; + static const s8 lna2G_gain_db_rev6[] = { -5, 6, 10, 14 }; + static const s8 lna2G_gain_db_rev6_224B0[] = { -5, 6, 10, 15 }; + static const s8 lna2A_gain_db[] = { -6, 2, 6, 10 }; + static const s8 lna2A_gain_db_rev4[] = { -5, 2, 6, 10 }; + static const s8 lna2A_gain_db_rev5[] = { -7, 0, 4, 8 }; + static const s8 lna2A_gain_db_rev6[] = { -7, 0, 4, 8 }; + const s8 *lna2_gain_db = NULL; + static const s8 tiaG_gain_db[] = { 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A }; - s8 tiaA_gain_db[] = { + static const s8 tiaA_gain_db[] = { 0x13, 0x13, 0x13, 0x13, 0x13, 0x13, 0x13, 0x13, 0x13, 0x13 }; - s8 tiaA_gain_db_rev4[] = { + static const s8 tiaA_gain_db_rev4[] = { 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d }; - s8 tiaA_gain_db_rev5[] = { + static const s8 tiaA_gain_db_rev5[] = { 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d }; - s8 tiaA_gain_db_rev6[] = { + static const s8 tiaA_gain_db_rev6[] = { 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d }; - s8 *tia_gain_db; - s8 tiaG_gainbits[] = { + const s8 *tia_gain_db; + static const s8 tiaG_gainbits[] = { 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 }; - s8 tiaA_gainbits[] = { + static const s8 tiaA_gainbits[] = { 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06 }; - s8 tiaA_gainbits_rev4[] = { + static const s8 tiaA_gainbits_rev4[] = { 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04 }; - s8 tiaA_gainbits_rev5[] = { + static const s8 tiaA_gainbits_rev5[] = { 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04 }; - s8 tiaA_gainbits_rev6[] = { + static const s8 tiaA_gainbits_rev6[] = { 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04 }; - s8 *tia_gainbits; 
- s8 lpf_gain_db[] = { 0x00, 0x06, 0x0c, 0x12, 0x12, 0x12 }; - s8 lpf_gainbits[] = { 0x00, 0x01, 0x02, 0x03, 0x03, 0x03 }; - u16 rfseqG_init_gain[] = { 0x613f, 0x613f, 0x613f, 0x613f }; - u16 rfseqG_init_gain_rev4[] = { 0x513f, 0x513f, 0x513f, 0x513f }; - u16 rfseqG_init_gain_rev5[] = { 0x413f, 0x413f, 0x413f, 0x413f }; - u16 rfseqG_init_gain_rev5_elna[] = { + const s8 *tia_gainbits; + static const s8 lpf_gain_db[] = { 0x00, 0x06, 0x0c, 0x12, 0x12, 0x12 }; + static const s8 lpf_gainbits[] = { 0x00, 0x01, 0x02, 0x03, 0x03, 0x03 }; + static const u16 rfseqG_init_gain[] = { 0x613f, 0x613f, 0x613f, 0x613f }; + static const u16 rfseqG_init_gain_rev4[] = { 0x513f, 0x513f, 0x513f, 0x513f }; + static const u16 rfseqG_init_gain_rev5[] = { 0x413f, 0x413f, 0x413f, 0x413f }; + static const u16 rfseqG_init_gain_rev5_elna[] = { 0x013f, 0x013f, 0x013f, 0x013f }; - u16 rfseqG_init_gain_rev6[] = { 0x513f, 0x513f }; - u16 rfseqG_init_gain_rev6_224B0[] = { 0x413f, 0x413f }; - u16 rfseqG_init_gain_rev6_elna[] = { 0x113f, 0x113f }; - u16 rfseqA_init_gain[] = { 0x516f, 0x516f, 0x516f, 0x516f }; - u16 rfseqA_init_gain_rev4[] = { 0x614f, 0x614f, 0x614f, 0x614f }; - u16 rfseqA_init_gain_rev4_elna[] = { + static const u16 rfseqG_init_gain_rev6[] = { 0x513f, 0x513f }; + static const u16 rfseqG_init_gain_rev6_224B0[] = { 0x413f, 0x413f }; + static const u16 rfseqG_init_gain_rev6_elna[] = { 0x113f, 0x113f }; + static const u16 rfseqA_init_gain[] = { 0x516f, 0x516f, 0x516f, 0x516f }; + static const u16 rfseqA_init_gain_rev4[] = { 0x614f, 0x614f, 0x614f, 0x614f }; + static const u16 rfseqA_init_gain_rev4_elna[] = { 0x314f, 0x314f, 0x314f, 0x314f }; - u16 rfseqA_init_gain_rev5[] = { 0x714f, 0x714f, 0x714f, 0x714f }; - u16 rfseqA_init_gain_rev6[] = { 0x714f, 0x714f }; - u16 *rfseq_init_gain; + static const u16 rfseqA_init_gain_rev5[] = { 0x714f, 0x714f, 0x714f, 0x714f }; + static const u16 rfseqA_init_gain_rev6[] = { 0x714f, 0x714f }; + const u16 *rfseq_init_gain; u16 initG_gaincode = 0x627e; u16 initG_gaincode_rev4 = 0x527e; u16 initG_gaincode_rev5 = 0x427e; @@ -15538,10 +15535,10 @@ static void wlc_phy_workarounds_nphy_gainctrl(struct brcms_phy *pi) u16 clip1mdA_gaincode_rev6 = 0x2084; u16 clip1md_gaincode = 0; u16 clip1loG_gaincode = 0x0074; - u16 clip1loG_gaincode_rev5[] = { + static const u16 clip1loG_gaincode_rev5[] = { 0x0062, 0x0064, 0x006a, 0x106a, 0x106c, 0x1074, 0x107c, 0x207c }; - u16 clip1loG_gaincode_rev6[] = { + static const u16 clip1loG_gaincode_rev6[] = { 0x106a, 0x106c, 0x1074, 0x107c, 0x007e, 0x107e, 0x207e, 0x307e }; u16 clip1loG_gaincode_rev6_224B0 = 0x1074; @@ -16066,7 +16063,7 @@ static void wlc_phy_workarounds_nphy_gainctrl(struct brcms_phy *pi) static void wlc_phy_workarounds_nphy(struct brcms_phy *pi) { - u8 rfseq_rx2tx_events[] = { + static const u8 rfseq_rx2tx_events[] = { NPHY_RFSEQ_CMD_NOP, NPHY_RFSEQ_CMD_RXG_FBW, NPHY_RFSEQ_CMD_TR_SWITCH, @@ -16076,7 +16073,7 @@ static void wlc_phy_workarounds_nphy(struct brcms_phy *pi) NPHY_RFSEQ_CMD_EXT_PA }; u8 rfseq_rx2tx_dlys[] = { 8, 6, 6, 2, 4, 60, 1 }; - u8 rfseq_tx2rx_events[] = { + static const u8 rfseq_tx2rx_events[] = { NPHY_RFSEQ_CMD_NOP, NPHY_RFSEQ_CMD_EXT_PA, NPHY_RFSEQ_CMD_TX_GAIN, @@ -16085,8 +16082,8 @@ static void wlc_phy_workarounds_nphy(struct brcms_phy *pi) NPHY_RFSEQ_CMD_RXG_FBW, NPHY_RFSEQ_CMD_CLR_HIQ_DIS }; - u8 rfseq_tx2rx_dlys[] = { 8, 6, 2, 4, 4, 6, 1 }; - u8 rfseq_tx2rx_events_rev3[] = { + static const u8 rfseq_tx2rx_dlys[] = { 8, 6, 2, 4, 4, 6, 1 }; + static const u8 rfseq_tx2rx_events_rev3[] = { NPHY_REV3_RFSEQ_CMD_EXT_PA, 
NPHY_REV3_RFSEQ_CMD_INT_PA_PU, NPHY_REV3_RFSEQ_CMD_TX_GAIN, @@ -16096,7 +16093,7 @@ static void wlc_phy_workarounds_nphy(struct brcms_phy *pi) NPHY_REV3_RFSEQ_CMD_CLR_HIQ_DIS, NPHY_REV3_RFSEQ_CMD_END }; - u8 rfseq_tx2rx_dlys_rev3[] = { 8, 4, 2, 2, 4, 4, 6, 1 }; + static const u8 rfseq_tx2rx_dlys_rev3[] = { 8, 4, 2, 2, 4, 4, 6, 1 }; u8 rfseq_rx2tx_events_rev3[] = { NPHY_REV3_RFSEQ_CMD_NOP, NPHY_REV3_RFSEQ_CMD_RXG_FBW, @@ -16110,7 +16107,7 @@ static void wlc_phy_workarounds_nphy(struct brcms_phy *pi) }; u8 rfseq_rx2tx_dlys_rev3[] = { 8, 6, 6, 4, 4, 18, 42, 1, 1 }; - u8 rfseq_rx2tx_events_rev3_ipa[] = { + static const u8 rfseq_rx2tx_events_rev3_ipa[] = { NPHY_REV3_RFSEQ_CMD_NOP, NPHY_REV3_RFSEQ_CMD_RXG_FBW, NPHY_REV3_RFSEQ_CMD_TR_SWITCH, @@ -16121,15 +16118,15 @@ static void wlc_phy_workarounds_nphy(struct brcms_phy *pi) NPHY_REV3_RFSEQ_CMD_INT_PA_PU, NPHY_REV3_RFSEQ_CMD_END }; - u8 rfseq_rx2tx_dlys_rev3_ipa[] = { 8, 6, 6, 4, 4, 16, 43, 1, 1 }; - u16 rfseq_rx2tx_dacbufpu_rev7[] = { 0x10f, 0x10f }; + static const u8 rfseq_rx2tx_dlys_rev3_ipa[] = { 8, 6, 6, 4, 4, 16, 43, 1, 1 }; + static const u16 rfseq_rx2tx_dacbufpu_rev7[] = { 0x10f, 0x10f }; s16 alpha0, alpha1, alpha2; s16 beta0, beta1, beta2; u32 leg_data_weights, ht_data_weights, nss1_data_weights, stbc_data_weights; u8 chan_freq_range = 0; - u16 dac_control = 0x0002; + static const u16 dac_control = 0x0002; u16 aux_adc_vmid_rev7_core0[] = { 0x8e, 0x96, 0x96, 0x96 }; u16 aux_adc_vmid_rev7_core1[] = { 0x8f, 0x9f, 0x9f, 0x96 }; u16 aux_adc_vmid_rev4[] = { 0xa2, 0xb4, 0xb4, 0x89 }; @@ -16139,8 +16136,8 @@ static void wlc_phy_workarounds_nphy(struct brcms_phy *pi) u16 aux_adc_gain_rev4[] = { 0x02, 0x02, 0x02, 0x00 }; u16 aux_adc_gain_rev3[] = { 0x02, 0x02, 0x02, 0x00 }; u16 *aux_adc_gain; - u16 sk_adc_vmid[] = { 0xb4, 0xb4, 0xb4, 0x24 }; - u16 sk_adc_gain[] = { 0x02, 0x02, 0x02, 0x02 }; + static const u16 sk_adc_vmid[] = { 0xb4, 0xb4, 0xb4, 0x24 }; + static const u16 sk_adc_gain[] = { 0x02, 0x02, 0x02, 0x02 }; s32 min_nvar_val = 0x18d; s32 min_nvar_offset_6mbps = 20; u8 pdetrange; @@ -16151,9 +16148,9 @@ static void wlc_phy_workarounds_nphy(struct brcms_phy *pi) u16 rfseq_rx2tx_lpf_h_hpc_rev7 = 0x77; u16 rfseq_tx2rx_lpf_h_hpc_rev7 = 0x77; u16 rfseq_pktgn_lpf_h_hpc_rev7 = 0x77; - u16 rfseq_htpktgn_lpf_hpc_rev7[] = { 0x77, 0x11, 0x11 }; - u16 rfseq_pktgn_lpf_hpc_rev7[] = { 0x11, 0x11 }; - u16 rfseq_cckpktgn_lpf_hpc_rev7[] = { 0x11, 0x11 }; + static const u16 rfseq_htpktgn_lpf_hpc_rev7[] = { 0x77, 0x11, 0x11 }; + static const u16 rfseq_pktgn_lpf_hpc_rev7[] = { 0x11, 0x11 }; + static const u16 rfseq_cckpktgn_lpf_hpc_rev7[] = { 0x11, 0x11 }; u16 ipalvlshift_3p3_war_en = 0; u16 rccal_bcap_val, rccal_scap_val; u16 rccal_tx20_11b_bcap = 0; @@ -24291,13 +24288,13 @@ static void wlc_phy_update_txcal_ladder_nphy(struct brcms_phy *pi, u16 core) u16 bbmult; u16 tblentry; - struct nphy_txiqcal_ladder ladder_lo[] = { + static const struct nphy_txiqcal_ladder ladder_lo[] = { {3, 0}, {4, 0}, {6, 0}, {9, 0}, {13, 0}, {18, 0}, {25, 0}, {25, 1}, {25, 2}, {25, 3}, {25, 4}, {25, 5}, {25, 6}, {25, 7}, {35, 7}, {50, 7}, {71, 7}, {100, 7} }; - struct nphy_txiqcal_ladder ladder_iq[] = { + static const struct nphy_txiqcal_ladder ladder_iq[] = { {3, 0}, {4, 0}, {6, 0}, {9, 0}, {13, 0}, {18, 0}, {25, 0}, {35, 0}, {50, 0}, {71, 0}, {100, 0}, {100, 1}, {100, 2}, {100, 3}, {100, 4}, {100, 5}, {100, 6}, {100, 7} @@ -25773,67 +25770,67 @@ wlc_phy_cal_txiqlo_nphy(struct brcms_phy *pi, struct nphy_txgains target_gain, u16 cal_gain[2]; struct nphy_iqcal_params cal_params[2]; u32 
tbl_len; - void *tbl_ptr; + const void *tbl_ptr; bool ladder_updated[2]; u8 mphase_cal_lastphase = 0; int bcmerror = 0; bool phyhang_avoid_state = false; - u16 tbl_tx_iqlo_cal_loft_ladder_20[] = { + static const u16 tbl_tx_iqlo_cal_loft_ladder_20[] = { 0x0300, 0x0500, 0x0700, 0x0900, 0x0d00, 0x1100, 0x1900, 0x1901, 0x1902, 0x1903, 0x1904, 0x1905, 0x1906, 0x1907, 0x2407, 0x3207, 0x4607, 0x6407 }; - u16 tbl_tx_iqlo_cal_iqimb_ladder_20[] = { + static const u16 tbl_tx_iqlo_cal_iqimb_ladder_20[] = { 0x0200, 0x0300, 0x0600, 0x0900, 0x0d00, 0x1100, 0x1900, 0x2400, 0x3200, 0x4600, 0x6400, 0x6401, 0x6402, 0x6403, 0x6404, 0x6405, 0x6406, 0x6407 }; - u16 tbl_tx_iqlo_cal_loft_ladder_40[] = { + static const u16 tbl_tx_iqlo_cal_loft_ladder_40[] = { 0x0200, 0x0300, 0x0400, 0x0700, 0x0900, 0x0c00, 0x1200, 0x1201, 0x1202, 0x1203, 0x1204, 0x1205, 0x1206, 0x1207, 0x1907, 0x2307, 0x3207, 0x4707 }; - u16 tbl_tx_iqlo_cal_iqimb_ladder_40[] = { + static const u16 tbl_tx_iqlo_cal_iqimb_ladder_40[] = { 0x0100, 0x0200, 0x0400, 0x0700, 0x0900, 0x0c00, 0x1200, 0x1900, 0x2300, 0x3200, 0x4700, 0x4701, 0x4702, 0x4703, 0x4704, 0x4705, 0x4706, 0x4707 }; - u16 tbl_tx_iqlo_cal_startcoefs[] = { + static const u16 tbl_tx_iqlo_cal_startcoefs[] = { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }; - u16 tbl_tx_iqlo_cal_cmds_fullcal[] = { + static const u16 tbl_tx_iqlo_cal_cmds_fullcal[] = { 0x8123, 0x8264, 0x8086, 0x8245, 0x8056, 0x9123, 0x9264, 0x9086, 0x9245, 0x9056 }; - u16 tbl_tx_iqlo_cal_cmds_recal[] = { + static const u16 tbl_tx_iqlo_cal_cmds_recal[] = { 0x8101, 0x8253, 0x8053, 0x8234, 0x8034, 0x9101, 0x9253, 0x9053, 0x9234, 0x9034 }; - u16 tbl_tx_iqlo_cal_startcoefs_nphyrev3[] = { + static const u16 tbl_tx_iqlo_cal_startcoefs_nphyrev3[] = { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }; - u16 tbl_tx_iqlo_cal_cmds_fullcal_nphyrev3[] = { + static const u16 tbl_tx_iqlo_cal_cmds_fullcal_nphyrev3[] = { 0x8434, 0x8334, 0x8084, 0x8267, 0x8056, 0x8234, 0x9434, 0x9334, 0x9084, 0x9267, 0x9056, 0x9234 }; - u16 tbl_tx_iqlo_cal_cmds_recal_nphyrev3[] = { + static const u16 tbl_tx_iqlo_cal_cmds_recal_nphyrev3[] = { 0x8423, 0x8323, 0x8073, 0x8256, 0x8045, 0x8223, 0x9423, 0x9323, 0x9073, 0x9256, 0x9045, 0x9223 }; -- cgit v1.2.3 From b178c94efdfd7e7c649277c0f570c5db14aaba4f Mon Sep 17 00:00:00 2001 From: Jan Luebbe Date: Mon, 28 Aug 2017 17:25:16 +0200 Subject: bus: mbus: fix window size calculation for 4GB windows commit 2bbbd96357ce76cc45ec722c00f654aa7b189112 upstream. At least the Armada XP SoC supports 4GB on a single DRAM window. Because the size register values contain the actual size - 1, the MSB is set in that case. For example, the SDRAM window's control register's value is 0xffffffe1 for 4GB (bits 31 to 24 contain the size). The MBUS driver reads back each window's size from registers and calculates the actual size as (control_reg | ~DDR_SIZE_MASK) + 1, which overflows for 32 bit values, resulting in other miscalculations further on (a bad RAM window for the CESA crypto engine calculated by mvebu_mbus_setup_cpu_target_nooverlap() in my case). This patch changes the type in 'struct mbus_dram_window' from u32 to u64, which allows us to keep using the same register calculation code in most MBUS-using drivers (which calculate ->size - 1 again). 
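To make the overflow concrete, here is a minimal user-space sketch of the old and new calculations (assumptions: DDR_SIZE_MASK is 0xff000000 as in this driver, and the control register value is the 4GB example quoted above):

	#include <stdint.h>
	#include <stdio.h>

	#define DDR_SIZE_MASK 0xff000000u	/* size field lives in bits 31:24 */

	int main(void)
	{
		uint32_t ctrl = 0xffffffe1;	/* SDRAM window control reg for 4GB */
		/* old driver code: the u32 addition wraps around to 0 */
		uint32_t old_size = (ctrl | ~DDR_SIZE_MASK) + 1;
		/* fixed code: widen before adding, giving 0x100000000 (4GB) */
		uint64_t new_size = (uint64_t)(ctrl | ~DDR_SIZE_MASK) + 1;

		printf("u32: %#x, u64: %#llx\n", (unsigned)old_size,
		       (unsigned long long)new_size);
		return 0;
	}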
Fixes: fddddb52a6c4 ("bus: introduce an Marvell EBU MBus driver") Signed-off-by: Jan Luebbe Signed-off-by: Gregory CLEMENT Signed-off-by: Greg Kroah-Hartman --- drivers/bus/mvebu-mbus.c | 2 +- include/linux/mbus.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/bus/mvebu-mbus.c b/drivers/bus/mvebu-mbus.c index c43c3d2baf73..0d628becf37f 100644 --- a/drivers/bus/mvebu-mbus.c +++ b/drivers/bus/mvebu-mbus.c @@ -720,7 +720,7 @@ mvebu_mbus_default_setup_cpu_target(struct mvebu_mbus_state *mbus) if (mbus->hw_io_coherency) w->mbus_attr |= ATTR_HW_COHERENCY; w->base = base & DDR_BASE_CS_LOW_MASK; - w->size = (size | ~DDR_SIZE_MASK) + 1; + w->size = (u64)(size | ~DDR_SIZE_MASK) + 1; } } mvebu_mbus_dram_info.num_cs = cs; diff --git a/include/linux/mbus.h b/include/linux/mbus.h index 1f7bc630d225..71a5a56b0bba 100644 --- a/include/linux/mbus.h +++ b/include/linux/mbus.h @@ -29,8 +29,8 @@ struct mbus_dram_target_info struct mbus_dram_window { u8 cs_index; u8 mbus_attr; - u32 base; - u32 size; + u64 base; + u64 size; } cs[4]; }; -- cgit v1.2.3 From cffdaa65e72f76efc821d551b78ceec1f76196be Mon Sep 17 00:00:00 2001 From: David Kozub Date: Thu, 19 Oct 2017 22:57:02 +0200 Subject: clockevents/drivers/cs5535: Improve resilience to spurious interrupts commit eb39a7c0355393c5a8d930f342ad7a6231b552c4 upstream. The interrupt handler mfgpt_tick() is not robust against spurious interrupts which happen before the clock event device is registered and fully initialized. The reason is that the safeguard against spurious interrupts solely checks for the clockevents shutdown state, but lacks a check for the detached state. If the interrupt hits while the device is in the detached state, it passes the safeguard and dereferences the event handler callback, which is NULL. Add the missing state check. Fixes: 8f9327cbb6e8 ("clockevents/drivers/cs5535: Migrate to new 'set-state' interface") Suggested-by: Thomas Gleixner Signed-off-by: David Kozub Signed-off-by: Thomas Gleixner Cc: Daniel Lezcano Link: https://lkml.kernel.org/r/20171020093103.3317F6004D@linux.fjfi.cvut.cz Signed-off-by: Greg Kroah-Hartman --- drivers/clocksource/cs5535-clockevt.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/clocksource/cs5535-clockevt.c b/drivers/clocksource/cs5535-clockevt.c index 9a7e37cf56b0..e1d7373e63e0 100644 --- a/drivers/clocksource/cs5535-clockevt.c +++ b/drivers/clocksource/cs5535-clockevt.c @@ -117,7 +117,8 @@ static irqreturn_t mfgpt_tick(int irq, void *dev_id) /* Turn off the clock (and clear the event) */ disable_timer(cs5535_event_clock); - if (clockevent_state_shutdown(&cs5535_clockevent)) + if (clockevent_state_detached(&cs5535_clockevent) || + clockevent_state_shutdown(&cs5535_clockevent)) return IRQ_HANDLED; /* Clear the counter */ -- cgit v1.2.3 From 51ba40fcfd6784c7576268aa9de23630c397f387 Mon Sep 17 00:00:00 2001 From: Larry Finger Date: Wed, 20 Sep 2017 16:15:05 -0500 Subject: rtlwifi: rtl8821ae: Fix connection lost problem commit b8b8b16352cd90c6083033fd4487f04fae935c18 upstream. In commit 40b368af4b75 ("rtlwifi: Fix alignment issues"), the read of REG_DBI_READ was changed from 16 to 8 bits. For unknown reasons, this change results in reduced stability for the wireless connection. This regression was located using bisection.
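For reference, a bisection like this follows the standard git workflow; the good revision below is a placeholder, not the one actually tested:

	$ git bisect start
	$ git bisect bad                 # running kernel keeps dropping the link
	$ git bisect good v4.12          # hypothetical last known-good release
	  (build and boot the kernel git checks out, test the connection, then
	   mark it "git bisect good" or "git bisect bad"; repeat until a single
	   commit remains)
	$ git bisect reset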
Fixes: 40b368af4b75 ("rtlwifi: Fix alignment issues") Reported-and-tested-by: James Cameron Signed-off-by: Larry Finger Cc: Ping-Ke Shih Signed-off-by: Kalle Valo Signed-off-by: Greg Kroah-Hartman --- drivers/net/wireless/realtek/rtlwifi/rtl8821ae/hw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/hw.c b/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/hw.c index c2103e7a8132..bbb789f8990b 100644 --- a/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/hw.c +++ b/drivers/net/wireless/realtek/rtlwifi/rtl8821ae/hw.c @@ -1127,7 +1127,7 @@ static u8 _rtl8821ae_dbi_read(struct rtl_priv *rtlpriv, u16 addr) } if (0 == tmp) { read_addr = REG_DBI_RDATA + addr % 4; - ret = rtl_read_byte(rtlpriv, read_addr); + ret = rtl_read_word(rtlpriv, read_addr); } return ret; } -- cgit v1.2.3 From 2b7e02267d3c8049b70fc44c410573fe0de8e6dc Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 9 Oct 2017 12:37:49 -0700 Subject: KEYS: encrypted: fix dereference of NULL user_key_payload commit 13923d0865ca96312197962522e88bc0aedccd74 upstream. A key of type "encrypted" references a "master key" which is used to encrypt and decrypt the encrypted key's payload. However, when we accessed the master key's payload, we failed to handle the case where the master key has been revoked, which sets the payload pointer to NULL. Note that request_key() *does* skip revoked keys, but there is still a window where the key can be revoked before we acquire its semaphore. Fix it by checking for a NULL payload, treating it like a key which was already revoked at the time it was requested. This was an issue for master keys of type "user" only. Master keys can also be of type "trusted", but those cannot be revoked. Fixes: 7e70cb497850 ("keys: add new key-type encrypted") Reviewed-by: James Morris Cc: Mimi Zohar Cc: David Safford Signed-off-by: Eric Biggers Signed-off-by: David Howells Signed-off-by: Greg Kroah-Hartman --- security/keys/encrypted-keys/encrypted.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/security/keys/encrypted-keys/encrypted.c b/security/keys/encrypted-keys/encrypted.c index 31898856682e..dbd75de136d7 100644 --- a/security/keys/encrypted-keys/encrypted.c +++ b/security/keys/encrypted-keys/encrypted.c @@ -315,6 +315,13 @@ static struct key *request_user_key(const char *master_desc, const u8 **master_k down_read(&ukey->sem); upayload = user_key_payload(ukey); + if (!upayload) { + /* key was revoked before we acquired its semaphore */ + up_read(&ukey->sem); + key_put(ukey); + ukey = ERR_PTR(-EKEYREVOKED); + goto error; + } *master_key = upayload->data; *master_keylen = upayload->datalen; error: -- cgit v1.2.3 From 503ef5c070a106b52fe34a04fdf02cf1f5662150 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 9 Oct 2017 12:43:20 -0700 Subject: lib/digsig: fix dereference of NULL user_key_payload commit 192cabd6a296cbc57b3d8c05c4c89d87fc102506 upstream. digsig_verify() requests a user key, then accesses its payload. However, a revoked key has a NULL payload, and we failed to check for this. request_key() *does* skip revoked keys, but there is still a window where the key can be revoked before we acquire its semaphore. Fix it by checking for a NULL payload, treating it like a key which was already revoked at the time it was requested. 
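The shape of the fix, shared by this and the previous patch, as a minimal sketch (the function and variable names are illustrative, not the exact digsig or encrypted-keys code):

	static int get_master_key_payload(const char *desc)
	{
		const struct user_key_payload *ukp;
		struct key *key;

		key = request_key(&key_type_user, desc, NULL);
		if (IS_ERR(key))
			return PTR_ERR(key);

		down_read(&key->sem);
		ukp = user_key_payload(key);
		if (!ukp) {
			/* key was revoked between request_key() and down_read() */
			up_read(&key->sem);
			key_put(key);
			return -EKEYREVOKED;
		}
		/* ukp->data and ukp->datalen are only valid under key->sem */
		up_read(&key->sem);
		key_put(key);
		return 0;
	}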
Fixes: 051dbb918c7f ("crypto: digital signature verification support") Reviewed-by: James Morris Cc: Dmitry Kasatkin Signed-off-by: Eric Biggers Signed-off-by: David Howells Signed-off-by: Greg Kroah-Hartman --- lib/digsig.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/lib/digsig.c b/lib/digsig.c index 07be6c1ef4e2..00c5c8179393 100644 --- a/lib/digsig.c +++ b/lib/digsig.c @@ -87,6 +87,12 @@ static int digsig_verify_rsa(struct key *key, down_read(&key->sem); ukp = user_key_payload(key); + if (!ukp) { + /* key was revoked before we acquired its semaphore */ + err = -EKEYREVOKED; + goto err1; + } + if (ukp->datalen < sizeof(*pkh)) goto err1; -- cgit v1.2.3 From 33dea302f9bc1e2c41392a308cfb50f6c02bb096 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 12 Oct 2017 16:00:41 +0100 Subject: KEYS: don't let add_key() update an uninstantiated key commit 60ff5b2f547af3828aebafd54daded44cfb0807a upstream. Currently, when passed a key that already exists, add_key() will call the key's ->update() method if such exists. But this is heavily broken in the case where the key is uninstantiated because it doesn't call __key_instantiate_and_link(). Consequently, it doesn't do most of the things that are supposed to happen when the key is instantiated, such as setting the instantiation state, clearing KEY_FLAG_USER_CONSTRUCT and awakening tasks waiting on it, and incrementing key->user->nikeys. It also never takes key_construction_mutex, which means that ->instantiate() can run concurrently with ->update() on the same key. In the case of the "user" and "logon" key types this causes a memory leak, at best. Maybe even worse, the ->update() methods of the "encrypted" and "trusted" key types actually just dereference a NULL pointer when passed an uninstantiated key. Change key_create_or_update() to wait interruptibly for the key to finish construction before continuing. This patch only affects *uninstantiated* keys. For now we still allow a negatively instantiated key to be updated (thereby positively instantiating it), although that's broken too (the next patch fixes it) and I'm not sure that anyone actually uses that functionality either. 
Here is a simple reproducer for the bug using the "encrypted" key type (requires CONFIG_ENCRYPTED_KEYS=y), though as noted above the bug pertained to more than just the "encrypted" key type: #include <stdlib.h> #include <unistd.h> #include <keyutils.h> int main(void) { int ringid = keyctl_join_session_keyring(NULL); if (fork()) { for (;;) { const char payload[] = "update user:foo 32"; usleep(rand() % 10000); add_key("encrypted", "desc", payload, sizeof(payload), ringid); keyctl_clear(ringid); } } else { for (;;) request_key("encrypted", "desc", "callout_info", ringid); } } It causes: BUG: unable to handle kernel NULL pointer dereference at 0000000000000018 IP: encrypted_update+0xb0/0x170 PGD 7a178067 P4D 7a178067 PUD 77269067 PMD 0 PREEMPT SMP CPU: 0 PID: 340 Comm: reproduce Tainted: G D 4.14.0-rc1-00025-g428490e38b2e #796 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 task: ffff8a467a39a340 task.stack: ffffb15c40770000 RIP: 0010:encrypted_update+0xb0/0x170 RSP: 0018:ffffb15c40773de8 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffff8a467a275b00 RCX: 0000000000000000 RDX: 0000000000000005 RSI: ffff8a467a275b14 RDI: ffffffffb742f303 RBP: ffffb15c40773e20 R08: 0000000000000000 R09: ffff8a467a275b17 R10: 0000000000000020 R11: 0000000000000000 R12: 0000000000000000 R13: 0000000000000000 R14: ffff8a4677057180 R15: ffff8a467a275b0f FS: 00007f5d7fb08700(0000) GS:ffff8a467f200000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000018 CR3: 0000000077262005 CR4: 00000000001606f0 Call Trace: key_create_or_update+0x2bc/0x460 SyS_add_key+0x10c/0x1d0 entry_SYSCALL_64_fastpath+0x1f/0xbe RIP: 0033:0x7f5d7f211259 RSP: 002b:00007ffed03904c8 EFLAGS: 00000246 ORIG_RAX: 00000000000000f8 RAX: ffffffffffffffda RBX: 000000003b2a7955 RCX: 00007f5d7f211259 RDX: 00000000004009e4 RSI: 00000000004009ff RDI: 0000000000400a04 RBP: 0000000068db8bad R08: 000000003b2a7955 R09: 0000000000000004 R10: 000000000000001a R11: 0000000000000246 R12: 0000000000400868 R13: 00007ffed03905d0 R14: 0000000000000000 R15: 0000000000000000 Code: 77 28 e8 64 34 1f 00 45 31 c0 31 c9 48 8d 55 c8 48 89 df 48 8d 75 d0 e8 ff f9 ff ff 85 c0 41 89 c4 0f 88 84 00 00 00 4c 8b 7d c8 <49> 8b 75 18 4c 89 ff e8 24 f8 ff ff 85 c0 41 89 c4 78 6d 49 8b RIP: encrypted_update+0xb0/0x170 RSP: ffffb15c40773de8 CR2: 0000000000000018 Reported-by: Eric Biggers Signed-off-by: David Howells cc: Eric Biggers Signed-off-by: Greg Kroah-Hartman --- security/keys/key.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/security/keys/key.c b/security/keys/key.c index 51d23c623424..2751ab4a7946 100644 --- a/security/keys/key.c +++ b/security/keys/key.c @@ -907,6 +907,16 @@ error: */ __key_link_end(keyring, &index_key, edit); + key = key_ref_to_ptr(key_ref); + if (test_bit(KEY_FLAG_USER_CONSTRUCT, &key->flags)) { + ret = wait_for_key_construction(key, true); + if (ret < 0) { + key_ref_put(key_ref); + key_ref = ERR_PTR(ret); + goto error_free_prep; + } + } + key_ref = __key_update(key_ref, &prep); goto error_free_prep; } -- cgit v1.2.3 From 6f0dee7d9c9b815c45b96cad2ab2958b4afe5840 Mon Sep 17 00:00:00 2001 From: Eric Sesterhenn Date: Sun, 8 Oct 2017 20:02:32 +0200 Subject: pkcs7: Prevent NULL pointer dereference, since sinfo is not always set. commit 68a1fdbbf8bd3378325e45c19e167a165f9ffc3a upstream. The ASN.1 parser does not necessarily set the sinfo field; this patch prevents a NULL pointer dereference on broken input.
Fixes: 99db44350672 ("PKCS#7: Appropriately restrict authenticated attributes and content type") Signed-off-by: Eric Sesterhenn Signed-off-by: David Howells Signed-off-by: Greg Kroah-Hartman --- crypto/asymmetric_keys/pkcs7_parser.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/crypto/asymmetric_keys/pkcs7_parser.c b/crypto/asymmetric_keys/pkcs7_parser.c index 8f3056cd0399..2516e97c58f1 100644 --- a/crypto/asymmetric_keys/pkcs7_parser.c +++ b/crypto/asymmetric_keys/pkcs7_parser.c @@ -90,6 +90,9 @@ static int pkcs7_check_authattrs(struct pkcs7_message *msg) bool want; sinfo = msg->signed_infos; + if (!sinfo) + goto inconsistent; + if (sinfo->authattrs) { want = true; msg->have_authattrs = true; -- cgit v1.2.3 From 558ca24dc296a859af75edf495a0972a00e9200d Mon Sep 17 00:00:00 2001 From: John David Anglin Date: Fri, 28 Oct 2016 22:13:42 +0200 Subject: parisc: Avoid trashing sr2 and sr3 in LWS code commit f4125cfdb3008363137f744c101e5d76ead760ba upstream. There is no need to trash sr2 and sr3 in the Light-weight syscall (LWS). sr2 already points to kernel space (it's zero in userspace, otherwise syscalls wouldn't work), and since the LWS code is executed in userspace, we can simply ignore to preload sr3. Signed-off-by: John David Anglin Signed-off-by: Helge Deller Signed-off-by: Greg Kroah-Hartman --- arch/parisc/kernel/syscall.S | 53 ++++++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 29 deletions(-) diff --git a/arch/parisc/kernel/syscall.S b/arch/parisc/kernel/syscall.S index a86b19fccb63..196973ead9b8 100644 --- a/arch/parisc/kernel/syscall.S +++ b/arch/parisc/kernel/syscall.S @@ -479,11 +479,6 @@ lws_start: comiclr,>> __NR_lws_entries, %r20, %r0 b,n lws_exit_nosys - /* WARNING: Trashing sr2 and sr3 */ - mfsp %sr7,%r1 /* get userspace into sr3 */ - mtsp %r1,%sr3 - mtsp %r0,%sr2 /* get kernel space into sr2 */ - /* Load table start */ ldil L%lws_table, %r1 ldo R%lws_table(%r1), %r28 /* Scratch use of r28 */ @@ -632,9 +627,9 @@ cas_action: stw %r1, 4(%sr2,%r20) #endif /* The load and store could fail */ -1: ldw,ma 0(%sr3,%r26), %r28 +1: ldw,ma 0(%r26), %r28 sub,<> %r28, %r25, %r0 -2: stw,ma %r24, 0(%sr3,%r26) +2: stw,ma %r24, 0(%r26) /* Free lock */ stw,ma %r20, 0(%sr2,%r20) #if ENABLE_LWS_DEBUG @@ -711,9 +706,9 @@ lws_compare_and_swap_2: nop /* 8bit load */ -4: ldb 0(%sr3,%r25), %r25 +4: ldb 0(%r25), %r25 b cas2_lock_start -5: ldb 0(%sr3,%r24), %r24 +5: ldb 0(%r24), %r24 nop nop nop @@ -721,9 +716,9 @@ lws_compare_and_swap_2: nop /* 16bit load */ -6: ldh 0(%sr3,%r25), %r25 +6: ldh 0(%r25), %r25 b cas2_lock_start -7: ldh 0(%sr3,%r24), %r24 +7: ldh 0(%r24), %r24 nop nop nop @@ -731,9 +726,9 @@ lws_compare_and_swap_2: nop /* 32bit load */ -8: ldw 0(%sr3,%r25), %r25 +8: ldw 0(%r25), %r25 b cas2_lock_start -9: ldw 0(%sr3,%r24), %r24 +9: ldw 0(%r24), %r24 nop nop nop @@ -742,14 +737,14 @@ lws_compare_and_swap_2: /* 64bit load */ #ifdef CONFIG_64BIT -10: ldd 0(%sr3,%r25), %r25 -11: ldd 0(%sr3,%r24), %r24 +10: ldd 0(%r25), %r25 +11: ldd 0(%r24), %r24 #else /* Load new value into r22/r23 - high/low */ -10: ldw 0(%sr3,%r25), %r22 -11: ldw 4(%sr3,%r25), %r23 +10: ldw 0(%r25), %r22 +11: ldw 4(%r25), %r23 /* Load new value into fr4 for atomic store later */ -12: flddx 0(%sr3,%r24), %fr4 +12: flddx 0(%r24), %fr4 #endif cas2_lock_start: @@ -799,30 +794,30 @@ cas2_action: ldo 1(%r0),%r28 /* 8bit CAS */ -13: ldb,ma 0(%sr3,%r26), %r29 +13: ldb,ma 0(%r26), %r29 sub,= %r29, %r25, %r0 b,n cas2_end -14: stb,ma %r24, 0(%sr3,%r26) +14: stb,ma %r24, 0(%r26) b cas2_end copy %r0, %r28 
nop nop /* 16bit CAS */ -15: ldh,ma 0(%sr3,%r26), %r29 sub,= %r29, %r25, %r0 b,n cas2_end -16: sth,ma %r24, 0(%sr3,%r26) +15: ldh,ma 0(%r26), %r29 sub,= %r29, %r25, %r0 b,n cas2_end +16: sth,ma %r24, 0(%r26) b cas2_end copy %r0, %r28 nop nop /* 32bit CAS */ -17: ldw,ma 0(%sr3,%r26), %r29 sub,= %r29, %r25, %r0 b,n cas2_end -18: stw,ma %r24, 0(%sr3,%r26) +17: ldw,ma 0(%r26), %r29 sub,= %r29, %r25, %r0 b,n cas2_end +18: stw,ma %r24, 0(%r26) b cas2_end copy %r0, %r28 nop @@ -830,22 +825,22 @@ cas2_action: /* 64bit CAS */ #ifdef CONFIG_64BIT -19: ldd,ma 0(%sr3,%r26), %r29 sub,*= %r29, %r25, %r0 b,n cas2_end -20: std,ma %r24, 0(%sr3,%r26) +19: ldd,ma 0(%r26), %r29 sub,*= %r29, %r25, %r0 b,n cas2_end +20: std,ma %r24, 0(%r26) copy %r0, %r28 #else /* Compare first word */ -19: ldw,ma 0(%sr3,%r26), %r29 sub,= %r29, %r22, %r0 b,n cas2_end /* Compare second word */ -20: ldw,ma 4(%sr3,%r26), %r29 sub,= %r29, %r23, %r0 b,n cas2_end /* Perform the store */ -21: fstdx %fr4, 0(%sr3,%r26) +19: ldw,ma 0(%r26), %r29 sub,= %r29, %r22, %r0 b,n cas2_end /* Compare second word */ +20: ldw,ma 4(%r26), %r29 sub,= %r29, %r23, %r0 b,n cas2_end /* Perform the store */ +21: fstdx %fr4, 0(%r26) copy %r0, %r28 #endif -- cgit v1.2.3 From fcc65ab173ebf797472b046f2d84663fbbe443a7 Mon Sep 17 00:00:00 2001 From: John David Anglin Date: Sat, 30 Sep 2017 17:24:23 -0400 Subject: parisc: Fix double-word compare and exchange in LWS code on 32-bit kernels commit 374b3bf8e8b519f61eb9775888074c6e46b3bf0c upstream. As discussed on the debian-hppa list, double-word compare and exchange operations fail on 32-bit kernels. Looking at the code, I realized that the ",ma" completer does the wrong thing in the "ldw,ma 4(%r26), %r29" instruction. This increments %r26 and causes the following store to write to the wrong location. Note by Helge Deller: The patch applies cleanly to stable kernel series if this upstream commit is merged in advance: f4125cfdb300 ("parisc: Avoid trashing sr2 and sr3 in LWS code"). Signed-off-by: John David Anglin Tested-by: Christoph Biedl Fixes: 89206491201c ("parisc: Implement new LWS CAS supporting 64 bit operations.") Signed-off-by: Helge Deller Signed-off-by: Greg Kroah-Hartman --- arch/parisc/kernel/syscall.S | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/parisc/kernel/syscall.S b/arch/parisc/kernel/syscall.S index 196973ead9b8..c6b855f7892c 100644 --- a/arch/parisc/kernel/syscall.S +++ b/arch/parisc/kernel/syscall.S @@ -740,7 +740,7 @@ lws_compare_and_swap_2: 10: ldd 0(%r25), %r25 11: ldd 0(%r24), %r24 #else - /* Load new value into r22/r23 - high/low */ + /* Load old value into r22/r23 - high/low */ 10: ldw 0(%r25), %r22 11: ldw 4(%r25), %r23 /* Load new value into fr4 for atomic store later */ @@ -832,11 +832,11 @@ cas2_action: copy %r0, %r28 #else /* Compare first word */ -19: ldw,ma 0(%r26), %r29 sub,= %r29, %r22, %r0 b,n cas2_end /* Compare second word */ -20: ldw,ma 4(%r26), %r29 sub,= %r29, %r23, %r0 b,n cas2_end /* Perform the store */ +19: ldw 0(%r26), %r29 sub,= %r29, %r22, %r0 b,n cas2_end /* Compare second word */ +20: ldw 4(%r26), %r29 sub,= %r29, %r23, %r0 b,n cas2_end /* Perform the store */ -- cgit v1.2.3 From 0f85c0954be46bbd36960191daa447ad86b98f0b Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 14 Nov 2016 19:46:09 +0100 Subject: sched/autogroup: Fix autogroup_move_group() to never skip sched_move_task() commit 18f649ef344127ef6de23a5a4272dbe2fdb73dde upstream. The PF_EXITING check in task_wants_autogroup() is no longer needed. Remove it, but see the next patch.
However the comment is correct in that autogroup_move_group() must always change task_group() for every thread so the sysctl_ check is very wrong; we can race with cgroups and even sys_setsid() is not safe because a task running with task_group() == ag->tg must participate in refcounting: int main(void) { int sctl = open("/proc/sys/kernel/sched_autogroup_enabled", O_WRONLY); assert(sctl > 0); if (fork()) { wait(NULL); // destroy the child's ag/tg pause(); } assert(pwrite(sctl, "1\n", 2, 0) == 2); assert(setsid() > 0); if (fork()) pause(); kill(getppid(), SIGKILL); sleep(1); // The child has gone, the grandchild runs with kref == 1 assert(pwrite(sctl, "0\n", 2, 0) == 2); assert(setsid() > 0); // runs with the freed ag/tg for (;;) sleep(1); return 0; } crashes the kernel. It doesn't really need sleep(1), it doesn't matter if autogroup_move_group() actually frees the task_group or this happens later. Reported-by: Vern Lovejoy Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: hartsjc@redhat.com Cc: vbendel@redhat.com Link: http://lkml.kernel.org/r/20161114184609.GA15965@redhat.com Signed-off-by: Ingo Molnar Signed-off-by: Sumit Semwal [sumits: submit to 4.4 LTS, post testing on Hikey] Signed-off-by: Greg Kroah-Hartman --- kernel/sched/auto_group.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index 750ed601ddf7..8620fd01b3d0 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c @@ -111,14 +111,11 @@ bool task_wants_autogroup(struct task_struct *p, struct task_group *tg) { if (tg != &root_task_group) return false; - /* - * We can only assume the task group can't go away on us if - * autogroup_move_group() can see us on ->thread_group list. + * If we race with autogroup_move_group() the caller can use the old + * value of signal->autogroup but in this case sched_move_task() will + * be called again before autogroup_kref_put(). */ - if (p->flags & PF_EXITING) - return false; - return true; } @@ -138,13 +135,17 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) } p->signal->autogroup = autogroup_kref_get(ag); - - if (!READ_ONCE(sysctl_sched_autogroup_enabled)) - goto out; - + /* + * We can't avoid sched_move_task() after we changed signal->autogroup, + * this process can already run with task_group() == prev->tg or we can + * race with cgroup code which can read autogroup = prev under rq->lock. + * In the latter case for_each_thread() can not miss a migrating thread, + * cpu_cgroup_attach() must not be possible after cgroup_exit() and it + * can't be removed from thread list, we hold ->siglock. + */ for_each_thread(p, t) sched_move_task(t); -out: + unlock_task_sighand(p, &flags); autogroup_kref_put(prev); } -- cgit v1.2.3 From 4db9f1113196e7b4df4e754e7e770b22aee81c01 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 5 Feb 2016 19:19:01 -0800 Subject: f2fs crypto: replace some BUG_ON()'s with error checks commit 66aa3e1274fcf887e9d6501a68163270fc7718e7 upstream. 
This patch adopts: ext4 crypto: replace some BUG_ON()'s with error checks Signed-off-by: Theodore Ts'o Signed-off-by: Jaegeuk Kim Signed-off-by: Eric Biggers Signed-off-by: Greg Kroah-Hartman --- fs/f2fs/crypto.c | 1 - fs/f2fs/crypto_fname.c | 2 -- fs/f2fs/crypto_key.c | 15 ++++++++++++--- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/crypto.c b/fs/f2fs/crypto.c index 4a62ef14e932..d879c6c846b7 100644 --- a/fs/f2fs/crypto.c +++ b/fs/f2fs/crypto.c @@ -362,7 +362,6 @@ static int f2fs_page_crypto(struct f2fs_crypto_ctx *ctx, else res = crypto_ablkcipher_encrypt(req); if (res == -EINPROGRESS || res == -EBUSY) { - BUG_ON(req->base.data != &ecr); wait_for_completion(&ecr.completion); res = ecr.res; } diff --git a/fs/f2fs/crypto_fname.c b/fs/f2fs/crypto_fname.c index 38349ed5ea51..0fce444dd5ae 100644 --- a/fs/f2fs/crypto_fname.c +++ b/fs/f2fs/crypto_fname.c @@ -124,7 +124,6 @@ static int f2fs_fname_encrypt(struct inode *inode, ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, ciphertext_len, iv); res = crypto_ablkcipher_encrypt(req); if (res == -EINPROGRESS || res == -EBUSY) { - BUG_ON(req->base.data != &ecr); wait_for_completion(&ecr.completion); res = ecr.res; } @@ -180,7 +179,6 @@ static int f2fs_fname_decrypt(struct inode *inode, ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, iv); res = crypto_ablkcipher_decrypt(req); if (res == -EINPROGRESS || res == -EBUSY) { - BUG_ON(req->base.data != &ecr); wait_for_completion(&ecr.completion); res = ecr.res; } diff --git a/fs/f2fs/crypto_key.c b/fs/f2fs/crypto_key.c index 18595d7a0efc..81c87f7a3251 100644 --- a/fs/f2fs/crypto_key.c +++ b/fs/f2fs/crypto_key.c @@ -75,7 +75,6 @@ static int f2fs_derive_key_aes(char deriving_key[F2FS_AES_128_ECB_KEY_SIZE], F2FS_AES_256_XTS_KEY_SIZE, NULL); res = crypto_ablkcipher_encrypt(req); if (res == -EINPROGRESS || res == -EBUSY) { - BUG_ON(req->base.data != &ecr); wait_for_completion(&ecr.completion); res = ecr.res; } @@ -189,7 +188,11 @@ int f2fs_get_encryption_info(struct inode *inode) keyring_key = NULL; goto out; } - BUG_ON(keyring_key->type != &key_type_logon); + if (keyring_key->type != &key_type_logon) { + printk_once(KERN_WARNING "f2fs: key type must be logon\n"); + res = -ENOKEY; + goto out; + } ukp = user_key_payload(keyring_key); if (ukp->datalen != sizeof(struct f2fs_encryption_key)) { res = -EINVAL; @@ -198,7 +201,13 @@ int f2fs_get_encryption_info(struct inode *inode) master_key = (struct f2fs_encryption_key *)ukp->data; BUILD_BUG_ON(F2FS_AES_128_ECB_KEY_SIZE != F2FS_KEY_DERIVATION_NONCE_SIZE); - BUG_ON(master_key->size != F2FS_AES_256_XTS_KEY_SIZE); + if (master_key->size != F2FS_AES_256_XTS_KEY_SIZE) { + printk_once(KERN_WARNING + "f2fs: key size incorrect: %d\n", + master_key->size); + res = -ENOKEY; + goto out; + } res = f2fs_derive_key_aes(ctx.nonce, master_key->raw, raw_key); if (res) -- cgit v1.2.3 From 7d9e13d953f2a3029d8b26a6f9a7dae83a4594ae Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 5 Feb 2016 19:38:42 -0800 Subject: f2fs crypto: add missing locking for keyring_key access commit 745e8490b1e960ad79859dd8ba6a0b5a8d3d994e upstream. 
This patch adopts: ext4 crypto: add missing locking for keyring_key access Signed-off-by: Theodore Ts'o Signed-off-by: Jaegeuk Kim Signed-off-by: Eric Biggers Signed-off-by: Greg Kroah-Hartman --- fs/f2fs/crypto_key.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/f2fs/crypto_key.c b/fs/f2fs/crypto_key.c index 81c87f7a3251..ae49be377b60 100644 --- a/fs/f2fs/crypto_key.c +++ b/fs/f2fs/crypto_key.c @@ -193,9 +193,11 @@ int f2fs_get_encryption_info(struct inode *inode) res = -ENOKEY; goto out; } + down_read(&keyring_key->sem); ukp = user_key_payload(keyring_key); if (ukp->datalen != sizeof(struct f2fs_encryption_key)) { res = -EINVAL; + up_read(&keyring_key->sem); goto out; } master_key = (struct f2fs_encryption_key *)ukp->data; @@ -206,10 +208,12 @@ int f2fs_get_encryption_info(struct inode *inode) "f2fs: key size incorrect: %d\n", master_key->size); res = -ENOKEY; + up_read(&keyring_key->sem); goto out; } res = f2fs_derive_key_aes(ctx.nonce, master_key->raw, raw_key); + up_read(&keyring_key->sem); if (res) goto out; -- cgit v1.2.3 From 1dda04c761abf006402f7f5e9adb11f9044731c8 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 9 Oct 2017 12:46:18 -0700 Subject: fscrypt: fix dereference of NULL user_key_payload commit d60b5b7854c3d135b869f74fb93eaf63cbb1991a upstream. When an fscrypt-encrypted file is opened, we request the file's master key from the keyrings service as a logon key, then access its payload. However, a revoked key has a NULL payload, and we failed to check for this. request_key() *does* skip revoked keys, but there is still a window where the key can be revoked before we acquire its semaphore. Fix it by checking for a NULL payload, treating it like a key which was already revoked at the time it was requested. Fixes: 88bd6ccdcdd6 ("ext4 crypto: add encryption key management facilities") Reviewed-by: James Morris Cc: [v4.1+] Signed-off-by: Eric Biggers Signed-off-by: David Howells Signed-off-by: Greg Kroah-Hartman --- fs/ext4/crypto_key.c | 6 ++++++ fs/f2fs/crypto_key.c | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/fs/ext4/crypto_key.c b/fs/ext4/crypto_key.c index 505f8afde57c..9a1bc638abce 100644 --- a/fs/ext4/crypto_key.c +++ b/fs/ext4/crypto_key.c @@ -204,6 +204,12 @@ int ext4_get_encryption_info(struct inode *inode) } down_read(&keyring_key->sem); ukp = user_key_payload(keyring_key); + if (!ukp) { + /* key was revoked before we acquired its semaphore */ + res = -EKEYREVOKED; + up_read(&keyring_key->sem); + goto out; + } if (ukp->datalen != sizeof(struct ext4_encryption_key)) { res = -EINVAL; up_read(&keyring_key->sem); diff --git a/fs/f2fs/crypto_key.c b/fs/f2fs/crypto_key.c index ae49be377b60..7e62889a1d3d 100644 --- a/fs/f2fs/crypto_key.c +++ b/fs/f2fs/crypto_key.c @@ -195,6 +195,12 @@ int f2fs_get_encryption_info(struct inode *inode) } down_read(&keyring_key->sem); ukp = user_key_payload(keyring_key); + if (!ukp) { + /* key was revoked before we acquired its semaphore */ + res = -EKEYREVOKED; + up_read(&keyring_key->sem); + goto out; + } if (ukp->datalen != sizeof(struct f2fs_encryption_key)) { res = -EINVAL; up_read(&keyring_key->sem); -- cgit v1.2.3 From 8a004caec12bf241e567e3640401256cc9bc2e45 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 4 Oct 2017 16:43:25 +0100 Subject: KEYS: Fix race between updating and finding a negative key commit 363b02dab09b3226f3bd1420dad9c72b79a42a76 upstream. 
Consolidate KEY_FLAG_INSTANTIATED, KEY_FLAG_NEGATIVE and the rejection error into one field such that: (1) The instantiation state can be modified/read atomically. (2) The error can be accessed atomically with the state. (3) The error isn't stored unioned with the payload pointers. This deals with the problem that the state is spread over three different objects (two bits and a separate variable) and reading or updating them atomically isn't practical, given that not only can uninstantiated keys change into instantiated or rejected keys, but rejected keys can also turn into instantiated keys - and someone accessing the key might not be using any locking. The main side effect of this problem is that what was held in the payload may change, depending on the state. For instance, you might observe the key to be in the rejected state. You then read the cached error, but if the key semaphore wasn't locked, the key might've become instantiated between the two reads - and you might now have something in hand that isn't actually an error code. The state is now KEY_IS_UNINSTANTIATED, KEY_IS_POSITIVE or a negative error code if the key is negatively instantiated. The key_is_instantiated() function is replaced with key_is_positive() to avoid confusion as negative keys are also 'instantiated'. Additionally, barriering is included: (1) Order payload-set before state-set during instantiation. (2) Order state-read before payload-read when using the key. Further separate barriering is necessary if RCU is being used to access the payload content after reading the payload pointers. Fixes: 146aa8b1453b ("KEYS: Merge the type-specific data with the payload data") Reported-by: Eric Biggers Signed-off-by: David Howells Reviewed-by: Eric Biggers Signed-off-by: Greg Kroah-Hartman --- include/linux/key.h | 49 ++++++++++++++++++++------------ net/dns_resolver/dns_key.c | 2 +- security/keys/big_key.c | 4 +-- security/keys/encrypted-keys/encrypted.c | 2 +- security/keys/gc.c | 8 +++--- security/keys/key.c | 31 +++++++++++++------- security/keys/keyctl.c | 9 +++--- security/keys/keyring.c | 10 +++---- security/keys/proc.c | 7 +++-- security/keys/process_keys.c | 2 +- security/keys/request_key.c | 7 ++--- security/keys/request_key_auth.c | 2 +- security/keys/trusted.c | 2 +- security/keys/user_defined.c | 4 +-- 14 files changed, 81 insertions(+), 58 deletions(-) diff --git a/include/linux/key.h b/include/linux/key.h index dcc115e8dd03..af071ca73079 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -126,6 +126,11 @@ static inline bool is_key_possessed(const key_ref_t key_ref) return (unsigned long) key_ref & 1UL; } +enum key_state { + KEY_IS_UNINSTANTIATED, + KEY_IS_POSITIVE, /* Positively instantiated */ +}; + /*****************************************************************************/ /* * authentication token / access credential / keyring @@ -157,6 +162,7 @@ struct key { * - may not match RCU dereferenced payload * - payload should contain own length */ + short state; /* Key state (+) or rejection error (-) */ #ifdef KEY_DEBUGGING unsigned magic; @@ -165,19 +171,17 @@ struct key { #endif unsigned long flags; /* status flags (change with bitops) */ -#define KEY_FLAG_INSTANTIATED 0 /* set if key has been instantiated */ -#define KEY_FLAG_DEAD 1 /* set if key type has been deleted */ -#define KEY_FLAG_REVOKED 2 /* set if key had been revoked */ -#define KEY_FLAG_IN_QUOTA 3 /* set if key consumes quota */ -#define KEY_FLAG_USER_CONSTRUCT 4 /* set if key is being constructed in userspace */ -#define 
KEY_FLAG_NEGATIVE 5 /* set if key is negative */ -#define KEY_FLAG_ROOT_CAN_CLEAR 6 /* set if key can be cleared by root without permission */ -#define KEY_FLAG_INVALIDATED 7 /* set if key has been invalidated */ -#define KEY_FLAG_TRUSTED 8 /* set if key is trusted */ -#define KEY_FLAG_TRUSTED_ONLY 9 /* set if keyring only accepts links to trusted keys */ -#define KEY_FLAG_BUILTIN 10 /* set if key is builtin */ -#define KEY_FLAG_ROOT_CAN_INVAL 11 /* set if key can be invalidated by root without permission */ -#define KEY_FLAG_UID_KEYRING 12 /* set if key is a user or user session keyring */ +#define KEY_FLAG_DEAD 0 /* set if key type has been deleted */ +#define KEY_FLAG_REVOKED 1 /* set if key had been revoked */ +#define KEY_FLAG_IN_QUOTA 2 /* set if key consumes quota */ +#define KEY_FLAG_USER_CONSTRUCT 3 /* set if key is being constructed in userspace */ +#define KEY_FLAG_ROOT_CAN_CLEAR 4 /* set if key can be cleared by root without permission */ +#define KEY_FLAG_INVALIDATED 5 /* set if key has been invalidated */ +#define KEY_FLAG_TRUSTED 6 /* set if key is trusted */ +#define KEY_FLAG_TRUSTED_ONLY 7 /* set if keyring only accepts links to trusted keys */ +#define KEY_FLAG_BUILTIN 8 /* set if key is builtin */ +#define KEY_FLAG_ROOT_CAN_INVAL 9 /* set if key can be invalidated by root without permission */ +#define KEY_FLAG_UID_KEYRING 10 /* set if key is a user or user session keyring */ /* the key type and key description string * - the desc is used to match a key against search criteria @@ -203,7 +207,6 @@ struct key { struct list_head name_link; struct assoc_array keys; }; - int reject_error; }; }; @@ -319,17 +322,27 @@ extern void key_set_timeout(struct key *, unsigned); #define KEY_NEED_SETATTR 0x20 /* Require permission to change attributes */ #define KEY_NEED_ALL 0x3f /* All the above permissions */ +static inline short key_read_state(const struct key *key) +{ + /* Barrier versus mark_key_instantiated(). */ + return smp_load_acquire(&key->state); +} + /** - * key_is_instantiated - Determine if a key has been positively instantiated + * key_is_positive - Determine if a key has been positively instantiated * @key: The key to check. * * Return true if the specified key has been positively instantiated, false * otherwise. 
*/ -static inline bool key_is_instantiated(const struct key *key) +static inline bool key_is_positive(const struct key *key) +{ + return key_read_state(key) == KEY_IS_POSITIVE; +} + +static inline bool key_is_negative(const struct key *key) { - return test_bit(KEY_FLAG_INSTANTIATED, &key->flags) && - !test_bit(KEY_FLAG_NEGATIVE, &key->flags); + return key_read_state(key) < 0; } #define rcu_dereference_key(KEY) \ diff --git a/net/dns_resolver/dns_key.c b/net/dns_resolver/dns_key.c index c79b85eb4d4c..6abc5012200b 100644 --- a/net/dns_resolver/dns_key.c +++ b/net/dns_resolver/dns_key.c @@ -224,7 +224,7 @@ static int dns_resolver_match_preparse(struct key_match_data *match_data) static void dns_resolver_describe(const struct key *key, struct seq_file *m) { seq_puts(m, key->description); - if (key_is_instantiated(key)) { + if (key_is_positive(key)) { int err = PTR_ERR(key->payload.data[dns_key_error]); if (err) diff --git a/security/keys/big_key.c b/security/keys/big_key.c index 907c1522ee46..08c4cc5c2973 100644 --- a/security/keys/big_key.c +++ b/security/keys/big_key.c @@ -138,7 +138,7 @@ void big_key_revoke(struct key *key) /* clear the quota */ key_payload_reserve(key, 0); - if (key_is_instantiated(key) && + if (key_is_positive(key) && (size_t)key->payload.data[big_key_len] > BIG_KEY_FILE_THRESHOLD) vfs_truncate(path, 0); } @@ -170,7 +170,7 @@ void big_key_describe(const struct key *key, struct seq_file *m) seq_puts(m, key->description); - if (key_is_instantiated(key)) + if (key_is_positive(key)) seq_printf(m, ": %zu [%s]", datalen, datalen > BIG_KEY_FILE_THRESHOLD ? "file" : "buff"); diff --git a/security/keys/encrypted-keys/encrypted.c b/security/keys/encrypted-keys/encrypted.c index dbd75de136d7..ce295c0c1da0 100644 --- a/security/keys/encrypted-keys/encrypted.c +++ b/security/keys/encrypted-keys/encrypted.c @@ -852,7 +852,7 @@ static int encrypted_update(struct key *key, struct key_preparsed_payload *prep) size_t datalen = prep->datalen; int ret = 0; - if (test_bit(KEY_FLAG_NEGATIVE, &key->flags)) + if (key_is_negative(key)) return -ENOKEY; if (datalen <= 0 || datalen > 32767 || !prep->data) return -EINVAL; diff --git a/security/keys/gc.c b/security/keys/gc.c index 9cb4fe4478a1..1659094d684d 100644 --- a/security/keys/gc.c +++ b/security/keys/gc.c @@ -129,15 +129,15 @@ static noinline void key_gc_unused_keys(struct list_head *keys) while (!list_empty(keys)) { struct key *key = list_entry(keys->next, struct key, graveyard_link); + short state = key->state; + list_del(&key->graveyard_link); kdebug("- %u", key->serial); key_check(key); /* Throw away the key data if the key is instantiated */ - if (test_bit(KEY_FLAG_INSTANTIATED, &key->flags) && - !test_bit(KEY_FLAG_NEGATIVE, &key->flags) && - key->type->destroy) + if (state == KEY_IS_POSITIVE && key->type->destroy) key->type->destroy(key); security_key_free(key); @@ -151,7 +151,7 @@ static noinline void key_gc_unused_keys(struct list_head *keys) } atomic_dec(&key->user->nkeys); - if (test_bit(KEY_FLAG_INSTANTIATED, &key->flags)) + if (state != KEY_IS_UNINSTANTIATED) atomic_dec(&key->user->nikeys); key_user_put(key->user); diff --git a/security/keys/key.c b/security/keys/key.c index 2751ab4a7946..4d971bf88ac3 100644 --- a/security/keys/key.c +++ b/security/keys/key.c @@ -395,6 +395,18 @@ int key_payload_reserve(struct key *key, size_t datalen) } EXPORT_SYMBOL(key_payload_reserve); +/* + * Change the key state to being instantiated. 
+ */ +static void mark_key_instantiated(struct key *key, int reject_error) +{ + /* Commit the payload before setting the state; barrier versus + * key_read_state(). + */ + smp_store_release(&key->state, + (reject_error < 0) ? reject_error : KEY_IS_POSITIVE); +} + /* * Instantiate a key and link it into the target keyring atomically. Must be * called with the target keyring's semaphore writelocked. The target key's @@ -418,14 +430,14 @@ static int __key_instantiate_and_link(struct key *key, mutex_lock(&key_construction_mutex); /* can't instantiate twice */ - if (!test_bit(KEY_FLAG_INSTANTIATED, &key->flags)) { + if (key->state == KEY_IS_UNINSTANTIATED) { /* instantiate the key */ ret = key->type->instantiate(key, prep); if (ret == 0) { /* mark the key as being instantiated */ atomic_inc(&key->user->nikeys); - set_bit(KEY_FLAG_INSTANTIATED, &key->flags); + mark_key_instantiated(key, 0); if (test_and_clear_bit(KEY_FLAG_USER_CONSTRUCT, &key->flags)) awaken = 1; @@ -553,13 +565,10 @@ int key_reject_and_link(struct key *key, mutex_lock(&key_construction_mutex); /* can't instantiate twice */ - if (!test_bit(KEY_FLAG_INSTANTIATED, &key->flags)) { + if (key->state == KEY_IS_UNINSTANTIATED) { /* mark the key as being negatively instantiated */ atomic_inc(&key->user->nikeys); - key->reject_error = -error; - smp_wmb(); - set_bit(KEY_FLAG_NEGATIVE, &key->flags); - set_bit(KEY_FLAG_INSTANTIATED, &key->flags); + mark_key_instantiated(key, -error); now = current_kernel_time(); key->expiry = now.tv_sec + timeout; key_schedule_gc(key->expiry + key_gc_delay); @@ -731,8 +740,8 @@ static inline key_ref_t __key_update(key_ref_t key_ref, ret = key->type->update(key, prep); if (ret == 0) - /* updating a negative key instantiates it */ - clear_bit(KEY_FLAG_NEGATIVE, &key->flags); + /* Updating a negative key positively instantiates it */ + mark_key_instantiated(key, 0); up_write(&key->sem); @@ -967,8 +976,8 @@ int key_update(key_ref_t key_ref, const void *payload, size_t plen) ret = key->type->update(key, &prep); if (ret == 0) - /* updating a negative key instantiates it */ - clear_bit(KEY_FLAG_NEGATIVE, &key->flags); + /* Updating a negative key positively instantiates it */ + mark_key_instantiated(key, 0); up_write(&key->sem); diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c index a009dc66eb8f..2e741e1a8712 100644 --- a/security/keys/keyctl.c +++ b/security/keys/keyctl.c @@ -738,10 +738,9 @@ long keyctl_read_key(key_serial_t keyid, char __user *buffer, size_t buflen) key = key_ref_to_ptr(key_ref); - if (test_bit(KEY_FLAG_NEGATIVE, &key->flags)) { - ret = -ENOKEY; - goto error2; - } + ret = key_read_state(key); + if (ret < 0) + goto error2; /* Negatively instantiated */ /* see if we can read it directly */ ret = key_permission(key_ref, KEY_NEED_READ); @@ -873,7 +872,7 @@ long keyctl_chown_key(key_serial_t id, uid_t user, gid_t group) atomic_dec(&key->user->nkeys); atomic_inc(&newowner->nkeys); - if (test_bit(KEY_FLAG_INSTANTIATED, &key->flags)) { + if (key->state != KEY_IS_UNINSTANTIATED) { atomic_dec(&key->user->nikeys); atomic_inc(&newowner->nikeys); } diff --git a/security/keys/keyring.c b/security/keys/keyring.c index 0c8dd4fbe130..ef828238cdc0 100644 --- a/security/keys/keyring.c +++ b/security/keys/keyring.c @@ -407,7 +407,7 @@ static void keyring_describe(const struct key *keyring, struct seq_file *m) else seq_puts(m, "[anon]"); - if (key_is_instantiated(keyring)) { + if (key_is_positive(keyring)) { if (keyring->keys.nr_leaves_on_tree != 0) seq_printf(m, ": %lu", 
keyring->keys.nr_leaves_on_tree); else @@ -522,7 +522,8 @@ static int keyring_search_iterator(const void *object, void *iterator_data) { struct keyring_search_context *ctx = iterator_data; const struct key *key = keyring_ptr_to_key(object); - unsigned long kflags = key->flags; + unsigned long kflags = READ_ONCE(key->flags); + short state = READ_ONCE(key->state); kenter("{%d}", key->serial); @@ -566,9 +567,8 @@ static int keyring_search_iterator(const void *object, void *iterator_data) if (ctx->flags & KEYRING_SEARCH_DO_STATE_CHECK) { /* we set a different error code if we pass a negative key */ - if (kflags & (1 << KEY_FLAG_NEGATIVE)) { - smp_rmb(); - ctx->result = ERR_PTR(key->reject_error); + if (state < 0) { + ctx->result = ERR_PTR(state); kleave(" = %d [neg]", ctx->skipped_ret); goto skipped; } diff --git a/security/keys/proc.c b/security/keys/proc.c index b9f531c9e4fa..036128682463 100644 --- a/security/keys/proc.c +++ b/security/keys/proc.c @@ -182,6 +182,7 @@ static int proc_keys_show(struct seq_file *m, void *v) unsigned long timo; key_ref_t key_ref, skey_ref; char xbuf[16]; + short state; int rc; struct keyring_search_context ctx = { @@ -240,17 +241,19 @@ static int proc_keys_show(struct seq_file *m, void *v) sprintf(xbuf, "%luw", timo / (60*60*24*7)); } + state = key_read_state(key); + #define showflag(KEY, LETTER, FLAG) \ (test_bit(FLAG, &(KEY)->flags) ? LETTER : '-') seq_printf(m, "%08x %c%c%c%c%c%c%c %5d %4s %08x %5d %5d %-9.9s ", key->serial, - showflag(key, 'I', KEY_FLAG_INSTANTIATED), + state != KEY_IS_UNINSTANTIATED ? 'I' : '-', showflag(key, 'R', KEY_FLAG_REVOKED), showflag(key, 'D', KEY_FLAG_DEAD), showflag(key, 'Q', KEY_FLAG_IN_QUOTA), showflag(key, 'U', KEY_FLAG_USER_CONSTRUCT), - showflag(key, 'N', KEY_FLAG_NEGATIVE), + state < 0 ? 'N' : '-', showflag(key, 'i', KEY_FLAG_INVALIDATED), atomic_read(&key->usage), xbuf, diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c index 7dd050f24261..ac1d5b2b1626 100644 --- a/security/keys/process_keys.c +++ b/security/keys/process_keys.c @@ -727,7 +727,7 @@ try_again: ret = -EIO; if (!(lflags & KEY_LOOKUP_PARTIAL) && - !test_bit(KEY_FLAG_INSTANTIATED, &key->flags)) + key_read_state(key) == KEY_IS_UNINSTANTIATED) goto invalid_key; /* check the permissions */ diff --git a/security/keys/request_key.c b/security/keys/request_key.c index c7a117c9a8f3..2ce733342b5a 100644 --- a/security/keys/request_key.c +++ b/security/keys/request_key.c @@ -594,10 +594,9 @@ int wait_for_key_construction(struct key *key, bool intr) intr ? 
TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE); if (ret) return -ERESTARTSYS; - if (test_bit(KEY_FLAG_NEGATIVE, &key->flags)) { - smp_rmb(); - return key->reject_error; - } + ret = key_read_state(key); + if (ret < 0) + return ret; return key_validate(key); } EXPORT_SYMBOL(wait_for_key_construction); diff --git a/security/keys/request_key_auth.c b/security/keys/request_key_auth.c index 4f0f112fe276..217775fcd0f3 100644 --- a/security/keys/request_key_auth.c +++ b/security/keys/request_key_auth.c @@ -73,7 +73,7 @@ static void request_key_auth_describe(const struct key *key, seq_puts(m, "key:"); seq_puts(m, key->description); - if (key_is_instantiated(key)) + if (key_is_positive(key)) seq_printf(m, " pid:%d ci:%zu", rka->pid, rka->callout_len); } diff --git a/security/keys/trusted.c b/security/keys/trusted.c index 16dec53184b6..509aedcf8310 100644 --- a/security/keys/trusted.c +++ b/security/keys/trusted.c @@ -1014,7 +1014,7 @@ static int trusted_update(struct key *key, struct key_preparsed_payload *prep) char *datablob; int ret = 0; - if (test_bit(KEY_FLAG_NEGATIVE, &key->flags)) + if (key_is_negative(key)) return -ENOKEY; p = key->payload.data[0]; if (!p->migratable) diff --git a/security/keys/user_defined.c b/security/keys/user_defined.c index 8705d79b2c6f..eba8a516ee9e 100644 --- a/security/keys/user_defined.c +++ b/security/keys/user_defined.c @@ -120,7 +120,7 @@ int user_update(struct key *key, struct key_preparsed_payload *prep) if (ret == 0) { /* attach the new data, displacing the old */ - if (!test_bit(KEY_FLAG_NEGATIVE, &key->flags)) + if (key_is_positive(key)) zap = key->payload.data[0]; else zap = NULL; @@ -174,7 +174,7 @@ EXPORT_SYMBOL_GPL(user_destroy); void user_describe(const struct key *key, struct seq_file *m) { seq_puts(m, key->description); - if (key_is_instantiated(key)) + if (key_is_positive(key)) seq_printf(m, ": %u", key->datalen); } -- cgit v1.2.3 From 1bb1d4252d1ede47afea054979fb9d95fc891743 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 8 Sep 2016 14:20:38 -0700 Subject: fscrypto: require write access to mount to set encryption policy commit ba63f23d69a3a10e7e527a02702023da68ef8a6d upstream. [Please apply to 4.4-stable. Note: this was already backported, but only to ext4; it was missed that it should go to f2fs as well. This is needed to make xfstest generic/395 pass on f2fs.] Since setting an encryption policy requires writing metadata to the filesystem, it should be guarded by mnt_want_write/mnt_drop_write. Otherwise, a user could cause a write to a frozen or readonly filesystem. This was handled correctly by f2fs but not by ext4. Make fscrypt_process_policy() handle it rather than relying on the filesystem to get it right. 
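The resulting pattern for any ioctl that writes filesystem metadata, as a minimal sketch (the function name is hypothetical; the real f2fs handler is in the diff below):

	static long fs_ioc_set_encryption_policy(struct file *filp,
						 const void __user *arg)
	{
		long err = mnt_want_write_file(filp);	/* rejects RO and frozen mounts */

		if (err)
			return err;
		/* ... copy the policy from *arg and write it to the inode ... */
		mnt_drop_write_file(filp);
		return err;
	}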
Signed-off-by: Eric Biggers Signed-off-by: Theodore Ts'o Acked-by: Jaegeuk Kim Signed-off-by: Greg Kroah-Hartman --- fs/f2fs/file.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 4b449d263333..01eed94b01ea 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1541,12 +1541,18 @@ static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg) sizeof(policy))) return -EFAULT; + err = mnt_want_write_file(filp); + if (err) + return err; + mutex_lock(&inode->i_mutex); err = f2fs_process_policy(&policy, inode); mutex_unlock(&inode->i_mutex); + mnt_drop_write_file(filp); + return err; #else return -EOPNOTSUPP; -- cgit v1.2.3 From aa3a0a70bdb8745864e41fca5f7722dfb3908d85 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 9 Oct 2017 12:40:00 -0700 Subject: FS-Cache: fix dereference of NULL user_key_payload commit d124b2c53c7bee6569d2a2d0b18b4a1afde00134 upstream. When the file /proc/fs/fscache/objects (available with CONFIG_FSCACHE_OBJECT_LIST=y) is opened, we request a user key with description "fscache:objlist", then access its payload. However, a revoked key has a NULL payload, and we failed to check for this. request_key() *does* skip revoked keys, but there is still a window where the key can be revoked before we access its payload. Fix it by checking for a NULL payload, treating it like a key which was already revoked at the time it was requested. Fixes: 4fbf4291aa15 ("FS-Cache: Allow the current state of all objects to be dumped") Reviewed-by: James Morris Signed-off-by: Eric Biggers Signed-off-by: David Howells Signed-off-by: Greg Kroah-Hartman --- fs/fscache/object-list.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c index 6b028b7c4250..926580a85153 100644 --- a/fs/fscache/object-list.c +++ b/fs/fscache/object-list.c @@ -330,6 +330,13 @@ static void fscache_objlist_config(struct fscache_objlist_data *data) rcu_read_lock(); confkey = user_key_payload(key); + if (!confkey) { + /* key was revoked */ + rcu_read_unlock(); + key_put(key); + goto no_config; + } + buf = confkey->data; for (len = confkey->datalen - 1; len >= 0; len--) { -- cgit v1.2.3 From 9b36699635c54b2e56ec3fc07a750dc465542a6d Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 27 Oct 2017 10:23:18 +0200 Subject: Linux 4.4.95 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index ff9d6bbf2210..57e1ea2a189a 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ VERSION = 4 PATCHLEVEL = 4 -SUBLEVEL = 94 +SUBLEVEL = 95 EXTRAVERSION = NAME = Blurry Fish Butt -- cgit v1.2.3 From fac311be26e5af64612c386f5a041984fe7c59a2 Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Tue, 25 Apr 2017 10:37:58 +0100 Subject: cpufreq/sched: Consider max cpu capacity when choosing frequencies When using schedfreq on cpus with max capacity significantly smaller than 1024, the tick update uses non-normalised capacities - this leads to selecting an incorrect OPP as we were scaling the frequency as if the max capacity achievable was 1024 rather than the max for that particular cpu or group. This could result in a cpu being stuck at the lowest OPP and unable to generate enough utilisation to climb out if the max capacity is significantly smaller than 1024. Instead, normalize the capacity to be in the range 0-1024 in the tick so that when we later select a frequency, we get the correct one. Also comments updated to be clearer about what is needed. 
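To make the normalization concrete, here is a worked example with assumed numbers (a little CPU whose capacity_orig_of() value is 430; SCHED_CAPACITY_SCALE is 1024):

	/* Scale-invariant utilization sampled at the tick: */
	cpu_utilization = 215;	/* the little CPU is about half busy */

	/* Normalized into the 0..1024 range expected by sched_capacity_reqs: */
	cpu_utilization = cpu_utilization * SCHED_CAPACITY_SCALE
			/ capacity_orig_of(cpu);	/* 215 * 1024 / 430 = 512 */

Without the normalization, 215 would be interpreted as roughly 21% of full capacity and could map to the lowest OPP even though the CPU is half busy.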
Change-Id: Id84391c7ac015311002ada21813a353ee13bee60 Signed-off-by: Chris Redpath --- kernel/sched/core.c | 4 ++++ kernel/sched/fair.c | 4 ++-- kernel/sched/sched.h | 4 ++++ 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 83f7c682032b..9cf530a6123e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2987,7 +2987,9 @@ static void sched_freq_tick_pelt(int cpu) * utilization and to harm its performance the least, request * a jump to a higher OPP as soon as the margin of free capacity * is impacted (specified by capacity_margin). + * Remember CPU utilization in sched_capacity_reqs should be normalised. */ + cpu_utilization = cpu_utilization * SCHED_CAPACITY_SCALE / capacity_orig_of(cpu); set_cfs_cpu_capacity(cpu, true, cpu_utilization); } @@ -3014,7 +3016,9 @@ static void sched_freq_tick_walt(int cpu) * It is likely that the load is growing so we * keep the added margin in our request as an * extra boost. + * Remember CPU utilization in sched_capacity_reqs should be normalised. */ + cpu_utilization = cpu_utilization * SCHED_CAPACITY_SCALE / capacity_orig_of(cpu); set_cfs_cpu_capacity(cpu, true, cpu_utilization); } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5cac6a77b2bc..e6b2461d07d6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4671,7 +4671,7 @@ static void update_capacity_of(int cpu) if (!sched_freq()) return; - /* Convert scale-invariant capacity to cpu. */ + /* Normalize scale-invariant capacity to cpu. */ req_cap = boosted_cpu_util(cpu); req_cap = req_cap * SCHED_CAPACITY_SCALE / capacity_orig_of(cpu); set_cfs_cpu_capacity(cpu, true, req_cap); @@ -4864,7 +4864,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (rq->cfs.nr_running) update_capacity_of(cpu_of(rq)); else if (sched_freq()) - set_cfs_cpu_capacity(cpu_of(rq), false, 0); + set_cfs_cpu_capacity(cpu_of(rq), false, 0); /* no normalization required for 0 */ } } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 9f3d89faacdc..5256f05a26e8 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1621,6 +1621,10 @@ static inline bool sched_freq(void) return static_key_false(&__sched_freq); } +/* + * sched_capacity_reqs expects capacity requests to be normalised. + * All capacities should sum to the range of 0-1024. + */ DECLARE_PER_CPU(struct sched_capacity_reqs, cpu_sched_capacity_reqs); void update_cpu_capacity_request(int cpu, bool request); -- cgit v1.2.3 From 792510d9b392b392c763c5e395f046d5c246c66e Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Tue, 30 May 2017 14:51:53 +0100 Subject: BACKPORT: sched/fair: Make it possible to account fair load avg consistently While set_task_rq_fair() is introduced in mainline by commit ad936d8658fd ("sched/fair: Make it possible to account fair load avg consistently"), the function was actually introduced here by the backport of commit 09a43ace1f98 ("sched/fair: Propagate load during synchronous attach/detach"). The problem (apart from the confusion introduced by the backport) is actually that set_task_rq_fair() is currently not called at all. Fix the problem by backporting again commit ad936d8658fd ("sched/fair: Make it possible to account fair load avg consistently"). Original change log: The current code accounts for the time a task was absent from the fair class (per ATTACH_AGE_LOAD). However it does not work correctly when a task got migrated or moved to another cgroup while outside of the fair class.
This patch tries to address that by aging on migration. We locklessly read the 'last_update_time' stamp from both the old and new cfs_rq, age the load up to the old time, and set it to the new time. These timestamps should in general not be more than 1 tick apart from one another, so there is a definite bound on things. Signed-off-by: Byungchul Park [ Changelog, a few edits and !SMP build fix ] Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1445616981-29904-2-git-send-email-byungchul.park@lge.com Signed-off-by: Ingo Molnar (cherry-picked from ad936d8658fd348338cb7d42c577dac77892b074) Signed-off-by: Juri Lelli Signed-off-by: Chris Redpath Change-Id: I17294ab0ada3901d35895014715fd60952949358 Signed-off-by: Brendan Jackman --- kernel/sched/core.c | 4 ++++ kernel/sched/sched.h | 11 ++++++++++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9cf530a6123e..e3242eed60e5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2173,6 +2173,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) INIT_LIST_HEAD(&p->se.group_node); walt_init_new_task_load(p); +#ifdef CONFIG_FAIR_GROUP_SCHED + p->se.cfs_rq = NULL; +#endif + #ifdef CONFIG_SCHEDSTATS memset(&p->se.statistics, 0, sizeof(p->se.statistics)); #endif diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 5256f05a26e8..1d52ca8a613c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -335,7 +335,15 @@ extern void sched_move_task(struct task_struct *tsk); #ifdef CONFIG_FAIR_GROUP_SCHED extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); -#endif + +#ifdef CONFIG_SMP +extern void set_task_rq_fair(struct sched_entity *se, + struct cfs_rq *prev, struct cfs_rq *next); +#else /* !CONFIG_SMP */ +static inline void set_task_rq_fair(struct sched_entity *se, + struct cfs_rq *prev, struct cfs_rq *next) { } +#endif /* CONFIG_SMP */ +#endif /* CONFIG_FAIR_GROUP_SCHED */ #else /* CONFIG_CGROUP_SCHED */ @@ -987,6 +995,7 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) #endif #ifdef CONFIG_FAIR_GROUP_SCHED + set_task_rq_fair(&p->se, p->se.cfs_rq, tg->cfs_rq[cpu]); p->se.cfs_rq = tg->cfs_rq[cpu]; p->se.parent = tg->se[cpu]; #endif -- cgit v1.2.3 From 6b02ab68ec78f41f647ebc32b1c37bd27fdb034e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 16 Jun 2016 18:51:48 +0200 Subject: UPSTREAM: sched/fair: Fix and optimize the fork() path The task_fork_fair() callback already calls __set_task_cpu() and takes rq->lock. If we move the sched_class::task_fork callback in sched_fork() under the existing p->pi_lock, right after its set_task_cpu() call, we can avoid doing two such calls and omit the IRQ disabling on the rq->lock. Change to __set_task_cpu() to skip the migration bits, this is a new task, not a migration. Similarly, make wake_up_new_task() use __set_task_cpu() for the same reason, the task hasn't actually migrated as it hasn't ever run. This cures the problem of calling migrate_task_rq_fair(), which does remove_entity_load_avg() on tasks that have never been added to the load avg to begin with. This bug would result in transiently messed up load_avg values, averaged out after a few dozen milliseconds. This is probably the reason why this bug was not found for such a long time.
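Condensed from the diff below, the resulting ordering in sched_fork() is (sketch only; unrelated setup omitted):

	raw_spin_lock_irqsave(&p->pi_lock, flags);
	/* First cpu assignment for a new task: not a migration, so skip the migration bits */
	__set_task_cpu(p, cpu);
	if (p->sched_class->task_fork)
		p->sched_class->task_fork(p);	/* now runs under p->pi_lock */
	raw_spin_unlock_irqrestore(&p->pi_lock, flags);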
Reported-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar (cherry picked from commit e210bffd39d01b649c94b820c28ff112673266dd) Change-Id: Icbddbaa6e8c1071859673d8685bc3f38955cf144 Signed-off-by: Brendan Jackman Signed-off-by: Chris Redpath --- kernel/sched/core.c | 16 +++++++++++----- kernel/sched/fair.c | 27 ++++++--------------------- 2 files changed, 17 insertions(+), 26 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e3242eed60e5..7e696bb3291a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2300,9 +2300,6 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) p->sched_class = &fair_sched_class; } - if (p->sched_class->task_fork) - p->sched_class->task_fork(p); - /* * The child is not yet in the pid-hash so no cgroup attach races, * and the cgroup is pinned to this child due to cgroup_fork() @@ -2311,7 +2308,13 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) * Silence PROVE_RCU. */ raw_spin_lock_irqsave(&p->pi_lock, flags); - set_task_cpu(p, cpu); + /* + * We're setting the cpu for the first time, we don't migrate, + * so use __set_task_cpu(). + */ + __set_task_cpu(p, cpu); + if (p->sched_class->task_fork) + p->sched_class->task_fork(p); raw_spin_unlock_irqrestore(&p->pi_lock, flags); #ifdef CONFIG_SCHED_INFO @@ -2453,8 +2456,11 @@ void wake_up_new_task(struct task_struct *p) * Fork balancing, do it here and not earlier because: * - cpus_allowed can change in the fork path * - any previously selected cpu might disappear through hotplug + * + * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, + * as we're not fully set-up yet. */ - set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); + __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); #endif rq = __task_rq_lock(p); post_init_entity_util_avg(&p->se); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e6b2461d07d6..d42a54f7b2e3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4712,7 +4712,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) * * note: in the case of encountering a throttled cfs_rq we will * post the final h_nr_running increment below. - */ + */ if (cfs_rq_throttled(cfs_rq)) break; cfs_rq->h_nr_running++; @@ -9901,31 +9901,17 @@ static void task_fork_fair(struct task_struct *p) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se, *curr; - int this_cpu = smp_processor_id(); struct rq *rq = this_rq(); - unsigned long flags; - - raw_spin_lock_irqsave(&rq->lock, flags); + raw_spin_lock(&rq->lock); update_rq_clock(rq); cfs_rq = task_cfs_rq(current); curr = cfs_rq->curr; - - /* - * Not only the cpu but also the task_group of the parent might have - * been changed after parent->se.parent,cfs_rq were copied to - * child->se.parent,cfs_rq. So call __set_task_cpu() to make those - * of child point to valid ones. 
- */ - rcu_read_lock(); - __set_task_cpu(p, this_cpu); - rcu_read_unlock(); - - update_curr(cfs_rq); - - if (curr) + if (curr) { + update_curr(cfs_rq); se->vruntime = curr->vruntime; + } place_entity(cfs_rq, se, 1); if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) { @@ -9938,8 +9924,7 @@ static void task_fork_fair(struct task_struct *p) } se->vruntime -= cfs_rq->min_vruntime; - - raw_spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_unlock(&rq->lock); } /* -- cgit v1.2.3 From 138a670d97ca84a4cab83515a0920d4ef8eeb22a Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 17 Jun 2016 13:38:55 +0200 Subject: BACKPORT: sched/cgroup: Fix cpu_cgroup_fork() handling A new fair task is detached and attached from/to task_group with: cgroup_post_fork() ss->fork(child) := cpu_cgroup_fork() sched_move_task() task_move_group_fair() Which is wrong, because at this point in fork() the task isn't fully initialized and it cannot 'move' to another group, because it's not attached to any group as yet. In fact, cpu_cgroup_fork() needs a small part of sched_move_task() so we can just call this small part directly instead of sched_move_task(). And the task doesn't really migrate because it is not yet attached so we need the following sequence: do_fork() sched_fork() __set_task_cpu() cgroup_post_fork() set_task_rq() # set task group and runqueue wake_up_new_task() select_task_rq() can select a new cpu __set_task_cpu post_init_entity_util_avg attach_task_cfs_rq() activate_task enqueue_task This patch makes that happen. BACKPORT: Difference from original commit: - Removed use of DEQUEUE_MOVE (which isn't defined in 4.4) in dequeue_task flags - Replaced "struct rq_flags rf" with "unsigned long flags". Signed-off-by: Vincent Guittot [ Added TASK_SET_GROUP to set depth properly. ] Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar (cherry picked from commit ea86cb4b7621e1298a37197005bf0abcc86348d4) Change-Id: I8126fd923288acf961218431ffd29d6bf6fd8d72 Signed-off-by: Brendan Jackman Signed-off-by: Chris Redpath --- kernel/sched/core.c | 63 ++++++++++++++++++++++++++++++++++------------------ kernel/sched/fair.c | 23 ++++++++++++++++++- kernel/sched/sched.h | 5 ++++- 3 files changed, 67 insertions(+), 24 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7e696bb3291a..0b5c588929e9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8188,27 +8188,9 @@ void sched_offline_group(struct task_group *tg) spin_unlock_irqrestore(&task_group_lock, flags); } -/* change task's runqueue when it moves between groups. - * The caller of this function should have put the task in its new group - * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
- */ -void sched_move_task(struct task_struct *tsk) +static void sched_change_group(struct task_struct *tsk, int type) { struct task_group *tg; - int queued, running; - unsigned long flags; - struct rq *rq; - - rq = task_rq_lock(tsk, &flags); - - running = task_current(rq, tsk); - queued = task_on_rq_queued(tsk); - - if (queued) - dequeue_task(rq, tsk, DEQUEUE_SAVE); - if (unlikely(running)) - put_prev_task(rq, tsk); /* * All callers are synchronized by task_rq_lock(); we do not use RCU @@ -8221,11 +8203,37 @@ void sched_move_task(struct task_struct *tsk) tsk->sched_task_group = tg; #ifdef CONFIG_FAIR_GROUP_SCHED - if (tsk->sched_class->task_move_group) - tsk->sched_class->task_move_group(tsk); + if (tsk->sched_class->task_change_group) + tsk->sched_class->task_change_group(tsk, type); else #endif set_task_rq(tsk, task_cpu(tsk)); +} + +/* + * Change task's runqueue when it moves between groups. + * + * The caller of this function should have put the task in its new group by + * now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect + * its new group. + */ +void sched_move_task(struct task_struct *tsk) +{ + int queued, running; + unsigned long flags; + struct rq *rq; + + rq = task_rq_lock(tsk, &flags); + + running = task_current(rq, tsk); + queued = task_on_rq_queued(tsk); + + if (queued) + dequeue_task(rq, tsk, DEQUEUE_SAVE); + if (unlikely(running)) + put_prev_task(rq, tsk); + + sched_change_group(tsk, TASK_MOVE_GROUP); if (unlikely(running)) tsk->sched_class->set_curr_task(rq); @@ -8662,9 +8670,20 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) sched_free_group(tg); } +/* + * This is called before wake_up_new_task(), therefore we really only + * have to set its group bits, all the other stuff does not apply. + */ static void cpu_cgroup_fork(struct task_struct *task, void *private) { - sched_move_task(task); + unsigned long flags; + struct rq *rq; + + rq = task_rq_lock(task, &flags); + + sched_change_group(task, TASK_SET_GROUP); + + task_rq_unlock(rq, task, &flags); } static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d42a54f7b2e3..2afd81bfda99 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -10116,6 +10116,14 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) } #ifdef CONFIG_FAIR_GROUP_SCHED +static void task_set_group_fair(struct task_struct *p) +{ + struct sched_entity *se = &p->se; + + set_task_rq(p, task_cpu(p)); + se->depth = se->parent ? 
se->parent->depth + 1 : 0; +} + static void task_move_group_fair(struct task_struct *p) { detach_task_cfs_rq(p); @@ -10128,6 +10136,19 @@ static void task_move_group_fair(struct task_struct *p) attach_task_cfs_rq(p); } +static void task_change_group_fair(struct task_struct *p, int type) +{ + switch (type) { + case TASK_SET_GROUP: + task_set_group_fair(p); + break; + + case TASK_MOVE_GROUP: + task_move_group_fair(p); + break; + } +} + void free_fair_sched_group(struct task_group *tg) { int i; @@ -10354,7 +10375,7 @@ const struct sched_class fair_sched_class = { .update_curr = update_curr_fair, #ifdef CONFIG_FAIR_GROUP_SCHED - .task_move_group = task_move_group_fair, + .task_change_group = task_change_group_fair, #endif }; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 1d52ca8a613c..8d3712107e61 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1293,8 +1293,11 @@ struct sched_class { void (*update_curr) (struct rq *rq); +#define TASK_SET_GROUP 0 +#define TASK_MOVE_GROUP 1 + #ifdef CONFIG_FAIR_GROUP_SCHED - void (*task_move_group) (struct task_struct *p); + void (*task_change_group)(struct task_struct *p, int type); #endif }; -- cgit v1.2.3 From 97cb74f48599ca1ae6c17955882f167a4c3aaad2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 16 Jun 2016 13:29:28 +0200 Subject: BACKPORT: sched/fair: Fix PELT integrity for new tasks Vincent and Yuyang found another few scenarios in which entity tracking goes wobbly. The scenarios are basically due to the fact that new tasks are not immediately attached and thereby differ from the normal situation -- a task is always attached to a cfs_rq load average (such that it includes its blocked contribution) and are explicitly detached/attached on migration to another cfs_rq. Scenario 1: switch to fair class p->sched_class = fair_class; if (queued) enqueue_task(p); ... enqueue_entity() enqueue_entity_load_avg() migrated = !sa->last_update_time (true) if (migrated) attach_entity_load_avg() check_class_changed() switched_from() (!fair) switched_to() (fair) switched_to_fair() attach_entity_load_avg() If @p is a new task that hasn't been fair before, it will have !last_update_time and, per the above, end up in attach_entity_load_avg() _twice_. Scenario 2: change between cgroups sched_move_group(p) if (queued) dequeue_task() task_move_group_fair() detach_task_cfs_rq() detach_entity_load_avg() set_task_rq() attach_task_cfs_rq() attach_entity_load_avg() if (queued) enqueue_task(); ... enqueue_entity() enqueue_entity_load_avg() migrated = !sa->last_update_time (true) if (migrated) attach_entity_load_avg() Similar as with scenario 1, if @p is a new task, it will have !load_update_time and we'll end up in attach_entity_load_avg() _twice_. Furthermore, notice how we do a detach_entity_load_avg() on something that wasn't attached to begin with. As stated above; the problem is that the new task isn't yet attached to the load tracking and thereby violates the invariant assumption. This patch remedies this by ensuring a new task is indeed properly attached to the load tracking on creation, through post_init_entity_util_avg(). Of course, this isn't entirely as straightforward as one might think, since the task is hashed before we call wake_up_new_task() and thus can be poked at. We avoid this by adding TASK_NEW and teaching cpu_cgroup_can_attach() to refuse such tasks. 
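Condensed from the diff below, the resulting lifecycle of a new task is:

	sched_fork():            p->state = TASK_NEW;	/* visible in the pid-hash, but cannot run or be woken */
	cpu_cgroup_can_attach(): refuse TASK_NEW tasks with -EINVAL, under p->pi_lock
	wake_up_new_task():      p->state = TASK_RUNNING;	/* load tracking attached, task enqueued */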
.:: BACKPORT Complicated by the fact that much of the lines changed by the original of this commit were then changed by: df217913e72e sched/fair: Factorize attach/detach entity and then d31b1a66cbe0 sched/fair: Factorize PELT update, which have both already been backported here. Reported-by: Yuyang Du Reported-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar (cherry picked from commit 7dc603c9028ea5d4354e0e317e8481df99b06d7e) Change-Id: Ibc59eb52310a62709d49a744bd5a24e8b97c4ae8 Signed-off-by: Brendan Jackman Signed-off-by: Chris Redpath --- include/linux/sched.h | 5 +++-- kernel/sched/core.c | 26 +++++++++++++++++++++++--- kernel/sched/fair.c | 15 ++++++++++----- 3 files changed, 36 insertions(+), 10 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index d0d0b1b45418..1872d19d1702 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -222,9 +222,10 @@ extern void proc_sched_set_task(struct task_struct *p); #define TASK_WAKING 256 #define TASK_PARKED 512 #define TASK_NOLOAD 1024 -#define TASK_STATE_MAX 2048 +#define TASK_NEW 2048 +#define TASK_STATE_MAX 4096 -#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPN" +#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPNn" extern char ___assert_task_state[1 - 2*!!( sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)]; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0b5c588929e9..31cb76a915a8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2259,11 +2259,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) __sched_fork(clone_flags, p); /* - * We mark the process as running here. This guarantees that + * We mark the process as NEW here. This guarantees that * nobody will actually run it, and a signal or other external * event cannot wake it up and insert it on the runqueue either. */ - p->state = TASK_RUNNING; + p->state = TASK_NEW; /* * Make sure we do not leak PI boosting priority to the child. @@ -2300,6 +2300,8 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) p->sched_class = &fair_sched_class; } + init_entity_runnable_average(&p->se); + /* * The child is not yet in the pid-hash so no cgroup attach races, * and the cgroup is pinned to this child due to cgroup_fork() @@ -2446,6 +2448,7 @@ void wake_up_new_task(struct task_struct *p) struct rq *rq; raw_spin_lock_irqsave(&p->pi_lock, flags); + p->state = TASK_RUNNING; walt_init_new_task_load(p); @@ -8690,6 +8693,7 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) { struct task_struct *task; struct cgroup_subsys_state *css; + int ret = 0; cgroup_taskset_for_each(task, css, tset) { #ifdef CONFIG_RT_GROUP_SCHED @@ -8700,8 +8704,24 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) if (task->sched_class != &fair_sched_class) return -EINVAL; #endif + /* + * Serialize against wake_up_new_task() such that if its + * running, we're sure to observe its full state. + */ + raw_spin_lock_irq(&task->pi_lock); + /* + * Avoid calling sched_move_task() before wake_up_new_task() + * has happened. This would lead to problems with PELT, due to + * move wanting to detach+attach while we're not attached yet.
+ */ + if (task->state == TASK_NEW) + ret = -EINVAL; + raw_spin_unlock_irq(&task->pi_lock); + + if (ret) + break; } - return 0; + return ret; } static void cpu_cgroup_attach(struct cgroup_taskset *tset) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2afd81bfda99..76af6e25e82e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -766,7 +766,9 @@ void init_entity_runnable_average(struct sched_entity *se) } static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); +static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq); static void attach_entity_cfs_rq(struct sched_entity *se); +static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se); /* * With new tasks being created, their initial util_avgs are extrapolated @@ -837,7 +839,7 @@ void post_init_entity_util_avg(struct sched_entity *se) attach_entity_cfs_rq(se); } -#else +#else /* !CONFIG_SMP */ void init_entity_runnable_average(struct sched_entity *se) { } @@ -3312,11 +3314,14 @@ void remove_entity_load_avg(struct sched_entity *se) struct cfs_rq *cfs_rq = cfs_rq_of(se); /* - * Newly created task or never used group entity should not be removed - * from its (source) cfs_rq + * tasks cannot exit without having gone through wake_up_new_task() -> + * post_init_entity_util_avg() which will have added things to the + * cfs_rq, so we can remove unconditionally. + * + * Similarly for groups, they will have passed through + * post_init_entity_util_avg() before unregister_sched_fair_group() + * calls this. */ - if (se->avg.last_update_time == 0) - return; sync_entity_load_avg(se); atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg); -- cgit v1.2.3 From c14c9b6e3e489062efe08c364196e76e705b8ea3 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Thu, 8 Dec 2016 17:56:53 +0100 Subject: UPSTREAM: sched/core: Fix find_idlest_group() for fork During fork, the utilization of a task is initialized once the rq has been selected because the current utilization level of the rq is used to set the utilization of the fork task. As the task's utilization is still 0 at this step of the fork sequence, it doesn't make sense to look for some spare capacity that can fit the task's utilization. Furthermore, I can see perf regressions for the test: hackbench -P -g 1 because the least loaded policy is always bypassed and tasks are not spread during fork. With this patch and the fix below, we are back to the same performance as v4.8. The fix below is only a temporary one used for the test until a smarter solution is found because we can't simply remove the test, which is useful for other benchmarks | @@ -5708,13 +5708,6 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t | | avg_cost = this_sd->avg_scan_cost; | | - /* | - * Due to large variance we need a large fuzz factor; hackbench in | - * particularly is sensitive here.
| - */ | - if ((avg_idle / 512) < avg_cost) | - return -1; | - | time = local_clock(); | | for_each_cpu_wrap(cpu, sched_domain_span(sd), target, wrap) { Tested-by: Matt Fleming Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Matt Fleming Acked-by: Morten Rasmussen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: kernellwp@gmail.com Cc: umgwanakikbuti@gmail.com Cc: yuyang.du@intel.comc Link: http://lkml.kernel.org/r/1481216215-24651-2-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar (cherry picked from commit f519a3f1c6b7a990e5aed37a8f853c6ecfdee945) Signed-off-by: Brendan Jackman Signed-off-by: Chris Redpath Change-Id: I86cc2ad81af3467c0b2f82b995111f428248baa4 --- kernel/sched/fair.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 76af6e25e82e..4a20c392c0eb 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6051,13 +6051,21 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, * utilized systems if we require spare_capacity > task_util(p), * so we allow for some task stuffing by using * spare_capacity > task_util(p)/2. + * + * Spare capacity can't be used for fork because the utilization has + * not been set yet, we must first select a rq to compute the initial + * utilization. */ + if (sd_flag & SD_BALANCE_FORK) + goto skip_spare; + if (this_spare > task_util(p) / 2 && imbalance*this_spare > 100*most_spare) return NULL; else if (most_spare > task_util(p) / 2) return most_spare_sg; +skip_spare: if (!idlest || 100*this_load < imbalance*min_load) return NULL; return idlest; -- cgit v1.2.3 From 4863faf5e4df76322999ad06502cbe27d0ec86dc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 3 Oct 2016 16:20:59 +0200 Subject: UPSTREAM: sched/core: Add missing update_rq_clock() in post_init_entity_util_avg() Address this rq-clock update bug: WARNING: CPU: 0 PID: 0 at ../kernel/sched/sched.h:797 post_init_entity_util_avg() rq->clock_update_flags < RQCF_ACT_SKIP Call Trace: __warn() post_init_entity_util_avg() wake_up_new_task() _do_fork() kernel_thread() rest_init() start_kernel() Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar (cherry picked from commit 4126bad6717336abe5d666440ae15555563ca53f) Change-Id: Ibe9a73386896377f96483d195e433259218755a5 Signed-off-by: Brendan Jackman Signed-off-by: Chris Redpath --- kernel/sched/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 31cb76a915a8..7da9ce69e707 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2466,6 +2466,7 @@ void wake_up_new_task(struct task_struct *p) __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); #endif rq = __task_rq_lock(p); + update_rq_clock(rq); post_init_entity_util_avg(&p->se); walt_mark_task_starting(p); -- cgit v1.2.3 From bea1b621d952079c4cebc1178ea580c290d0446e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 3 Oct 2016 16:28:37 +0200 Subject: UPSTREAM: sched/core: Add missing update_rq_clock() in detach_task_cfs_rq() Instead of adding the update_rq_clock() all the way at the bottom of the callstack, add one at the top, this to aid later effort to minimize update_rq_lock() calls. 
WARNING: CPU: 0 PID: 1 at ../kernel/sched/sched.h:797 detach_task_cfs_rq() rq->clock_update_flags < RQCF_ACT_SKIP Call Trace: dump_stack() __warn() warn_slowpath_fmt() detach_task_cfs_rq() switched_from_fair() __sched_setscheduler() _sched_setscheduler() sched_set_stop_task() cpu_stop_create() __smpboot_create_thread.part.2() smpboot_register_percpu_thread_cpumask() cpu_stop_init() do_one_initcall() ? print_cpu_info() kernel_init_freeable() ? rest_init() kernel_init() ret_from_fork() Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar (cherry picked from commit 80f5c1b84baa8180c3c27b7e227429712cd967b6) Change-Id: Ibffde077d18eabec4c2984158bd9d6d73bd0fb96 Signed-off-by: Brendan Jackman Signed-off-by: Chris Redpath --- kernel/sched/core.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7da9ce69e707..4097f9fa541b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3578,6 +3578,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) BUG_ON(prio > MAX_PRIO); rq = __task_rq_lock(p); + update_rq_clock(rq); /* * Idle task boosting is a nono in general. There is one @@ -4095,6 +4096,7 @@ recheck: * runqueue lock must be held. */ rq = task_rq_lock(p, &flags); + update_rq_clock(rq); /* * Changing the policy of the stop threads its a very bad idea @@ -8685,6 +8687,7 @@ static void cpu_cgroup_fork(struct task_struct *task, void *private) rq = task_rq_lock(task, &flags); + update_rq_clock(rq); sched_change_group(task, TASK_SET_GROUP); task_rq_unlock(rq, task, &flags); -- cgit v1.2.3 From bab39eb879251debe02e573f5453bd93ad5350bd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 3 Oct 2016 16:35:32 +0200 Subject: UPSTREAM: sched/core: Add missing update_rq_clock() call for task_hot() Add the update_rq_clock() call at the top of the callstack instead of at the bottom where we find it missing, this to aid later effort to minimize the number of update_rq_lock() calls. 
WARNING: CPU: 30 PID: 194 at ../kernel/sched/sched.h:797 assert_clock_updated() rq->clock_update_flags < RQCF_ACT_SKIP Call Trace: dump_stack() __warn() warn_slowpath_fmt() assert_clock_updated.isra.63.part.64() can_migrate_task() load_balance() pick_next_task_fair() __schedule() schedule() worker_thread() kthread() Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar (cherry picked from commit 3bed5e2166a5e433bf62162f3cd3c5174d335934) Change-Id: Ief5070dcce486535334dcb739ee16b989ea9df42 Signed-off-by: Brendan Jackman Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4a20c392c0eb..f6e730bcde27 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8998,6 +8998,7 @@ redo: more_balance: raw_spin_lock_irqsave(&busiest->lock, flags); + update_rq_clock(busiest); /* * cur_ld_moved - load moved in current iteration @@ -9395,6 +9396,7 @@ static int active_load_balance_cpu_stop(void *data) }; schedstat_inc(sd, alb_count); + update_rq_clock(busiest_rq); p = detach_one_task(&env); if (p) { -- cgit v1.2.3 From fd4a95dab858fa1350cc94ce4b435b7295db3ed3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 3 Oct 2016 16:44:25 +0200 Subject: UPSTREAM: sched/core: Add missing update_rq_clock() call in set_user_nice() Address this rq-clock update bug: WARNING: CPU: 30 PID: 195 at ../kernel/sched/sched.h:797 set_next_entity() rq->clock_update_flags < RQCF_ACT_SKIP Call Trace: dump_stack() __warn() warn_slowpath_fmt() set_next_entity() ? _raw_spin_lock() set_curr_task_fair() set_user_nice.part.85() set_user_nice() create_worker() worker_thread() kthread() ret_from_fork() Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar (cherry picked from commit 2fb8d36787affe26f3536c3d8ec094995a48037d) Change-Id: I53ba056e72820c7fadb3f022e4ee3b821c0de17d Signed-off-by: Brendan Jackman Signed-off-by: Chris Redpath --- kernel/sched/core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4097f9fa541b..ca9d72fa8e66 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3670,6 +3670,8 @@ void set_user_nice(struct task_struct *p, long nice) * the task might be in the middle of scheduling on another CPU. */ rq = task_rq_lock(p, &flags); + update_rq_clock(rq); + /* * The RT priorities are set via sched_setscheduler(), but we still * allow the 'normal' nice value to be set - but as expected -- cgit v1.2.3 From 795a6867cfe1ea0bbe967d2b037ae856484556a4 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Mon, 7 Aug 2017 17:39:00 +0100 Subject: UPSTREAM: sched/fair: Force balancing on nohz balance if local group has capacity The "goto force_balance" here is intended to mitigate the fact that avg_load calculations can result in bad placement decisions when priority is asymmetrical. The original commit that adds it: fab476228ba3 ("sched: Force balancing on newidle balance if local group has capacity") explains: Under certain situations, such as a niced down task (i.e. nice = -15) in the presence of nr_cpus NICE0 tasks, the niced task lands on a sched group and kicks away other tasks because of its large weight. This leads to sub-optimal utilization of the machine. 
Even though the sched group has capacity, it does not pull tasks because sds.this_load >> sds.max_load, and f_b_g() returns NULL. A similar but inverted issue also affects ARM big.LITTLE (asymmetrical CPU capacity) systems - consider 8 always-running, same-priority tasks on a system with 4 "big" and 4 "little" CPUs. Suppose that 5 of them end up on the "big" CPUs (which will be represented by one sched_group in the DIE sched_domain) and 3 on the "little" (the other sched_group in DIE), leaving one CPU unused. Because the "big" group has a higher group_capacity its avg_load may not present an imbalance that would cause migrating a task to the idle "little". The force_balance case here solves the problem but currently only for CPU_NEWLY_IDLE balances, which in theory might never happen on the unused CPU. Including CPU_IDLE in the force_balance case means there's an upper bound on the time before we can attempt to solve the underutilization: after DIE's sd->balance_interval has passed the next nohz balance kick will help us out. Change-Id: I807ba5cba0ef1b8bbec02cbcd4755fd32af10135 Signed-off-by: Brendan Jackman Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Morten Rasmussen Cc: Paul Turner Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170807163900.25180-1-brendan.jackman@arm.com Signed-off-by: Ingo Molnar (cherry-picked-from: commit 583ffd99d765 tip:sched/core) Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f6e730bcde27..ae94c1124655 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8707,8 +8707,11 @@ static struct sched_group *find_busiest_group(struct lb_env *env) if (busiest->group_type == group_imbalanced) goto force_balance; - /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ - if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) && + /* + * When dst_cpu is idle, prevent SMP nice and/or asymmetric group + * capacities from resulting in underutilization due to avg_load. + */ + if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) && busiest->group_no_capacity) goto force_balance; -- cgit v1.2.3 From 0f743ce7458c3c7d4be2367c8ed538069f1472b6 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Thu, 31 Aug 2017 12:57:58 +0100 Subject: BACKPORT: sched/fair: Move select_task_rq_fair slow-path into its own function In preparation for changes that would otherwise require adding a new level of indentation to the while(sd) loop, create a new function find_idlest_cpu() which contains this loop, and rename the existing find_idlest_cpu() to find_idlest_group_cpu(). Code inside the while(sd) loop is unchanged. @new_cpu is added as a variable in the new function, with the same initial value as the @new_cpu in select_task_rq_fair(). 
Change-Id: I9842308cab00dc9cd6c513fc38c609089a1aaaaf Suggested-by: Peter Zijlstra Signed-off-by: Brendan Jackman Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Josef Bacik Reviewed-by: Vincent Guittot Cc: Dietmar Eggemann Cc: Josef Bacik Cc: Linus Torvalds Cc: Mike Galbraith Cc: Morten Rasmussen Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20171005114516.18617-2-brendan.jackman@arm.com Signed-off-by: Ingo Molnar (reworked for eas/cas schedstats added in Android) (cherry-picked commit 18bd1b4bd53a from tip:sched/core) Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 114 ++++++++++++++++++++++++++++------------------------ 1 file changed, 62 insertions(+), 52 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ae94c1124655..561d1c505512 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6072,10 +6072,10 @@ skip_spare: } /* - * find_idlest_cpu - find the idlest cpu among the cpus in group. + * find_idlest_group_cpu - find the idlest cpu among the cpus in group. */ static int -find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) +find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) { unsigned long load, min_load = ULONG_MAX; unsigned int min_exit_latency = UINT_MAX; @@ -6122,6 +6122,65 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) } return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu; + } + +static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p, + int cpu, int prev_cpu, int sd_flag) +{ + int new_cpu = prev_cpu; + int wu = sd_flag & SD_BALANCE_WAKE; + int cas_cpu = -1; + + if (wu) { + schedstat_inc(p, se.statistics.nr_wakeups_cas_attempts); + schedstat_inc(this_rq(), eas_stats.cas_attempts); + } + + while (sd) { + struct sched_group *group; + struct sched_domain *tmp; + int weight; + + if (wu) + schedstat_inc(sd, eas_stats.cas_attempts); + + if (!(sd->flags & sd_flag)) { + sd = sd->child; + continue; + } + + group = find_idlest_group(sd, p, cpu, sd_flag); + if (!group) { + sd = sd->child; + continue; + } + + new_cpu = find_idlest_group_cpu(group, p, cpu); + if (new_cpu == -1 || new_cpu == cpu) { + /* Now try balancing at a lower domain level of cpu */ + sd = sd->child; + continue; + } + + /* Now try balancing at a lower domain level of new_cpu */ + cpu = cas_cpu = new_cpu; + weight = sd->span_weight; + sd = NULL; + for_each_domain(cpu, tmp) { + if (weight <= tmp->span_weight) + break; + if (tmp->flags & sd_flag) + sd = tmp; + } + /* while loop will break here if sd == NULL */ + } + + if (wu && (cas_cpu >= 0)) { + schedstat_inc(p, se.statistics.nr_wakeups_cas_count); + schedstat_inc(this_rq(), eas_stats.cas_count); + } + + return new_cpu; } /* @@ -6698,56 +6757,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); } else { - int wu = sd_flag & SD_BALANCE_WAKE; - int cas_cpu = -1; - - if (wu) { - schedstat_inc(p, se.statistics.nr_wakeups_cas_attempts); - schedstat_inc(this_rq(), eas_stats.cas_attempts); - } - - while (sd) { - struct sched_group *group; - int weight; - - if (wu) - schedstat_inc(sd, eas_stats.cas_attempts); - - if (!(sd->flags & sd_flag)) { - sd = sd->child; - continue; - } - - group = find_idlest_group(sd, p, cpu, sd_flag); - if (!group) { - sd = sd->child; - continue; - } - - new_cpu = find_idlest_cpu(group, p, cpu); - if (new_cpu == -1 || new_cpu == cpu) { - /* Now try balancing at a lower domain level 
of cpu */ - sd = sd->child; - continue; - } - - /* Now try balancing at a lower domain level of new_cpu */ - cpu = cas_cpu = new_cpu; - weight = sd->span_weight; - sd = NULL; - for_each_domain(cpu, tmp) { - if (weight <= tmp->span_weight) - break; - if (tmp->flags & sd_flag) - sd = tmp; - } - /* while loop will break here if sd == NULL */ - } - - if (wu && (cas_cpu >= 0)) { - schedstat_inc(p, se.statistics.nr_wakeups_cas_count); - schedstat_inc(this_rq(), eas_stats.cas_count); - } + new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag); } rcu_read_unlock(); -- cgit v1.2.3 From 529def2ffe532855f570a3a00e9f78ce59c8d84b Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Thu, 31 Aug 2017 12:57:59 +0100 Subject: UPSTREAM: sched/fair: Remove unnecessary comparison with -1 Since commit: 83a0a96a5f26 ("sched/fair: Leverage the idle state info when choosing the "idlest" cpu") find_idlest_group_cpu() (formerly find_idlest_cpu) no longer returns -1, so we can simplify the checking of the return value in find_idlest_cpu(). Change-Id: I98f4b9f178cd93a30408e024e608d36771764c7b Signed-off-by: Brendan Jackman Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Josef Bacik Reviewed-by: Vincent Guittot Cc: Dietmar Eggemann Cc: Josef Bacik Cc: Linus Torvalds Cc: Mike Galbraith Cc: Morten Rasmussen Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20171005114516.18617-3-brendan.jackman@arm.com Signed-off-by: Ingo Molnar (cherry-picked-from commit e90381eaecf6 in tip:sched/core) Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 561d1c505512..cf28d82fad41 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6156,7 +6156,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p } new_cpu = find_idlest_group_cpu(group, p, cpu); - if (new_cpu == -1 || new_cpu == cpu) { + if (new_cpu == cpu) { /* Now try balancing at a lower domain level of cpu */ sd = sd->child; continue; -- cgit v1.2.3 From 9c825cf6165c1662ce588a1fe11a912eaeaec928 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Thu, 31 Aug 2017 12:58:00 +0100 Subject: BACKPORT: sched/fair: Fix find_idlest_group when local group is not allowed When the local group is not allowed we do not modify this_*_load from their initial value of 0. That means that the load checks at the end of find_idlest_group cause us to incorrectly return NULL. Fixing the initial values to ULONG_MAX means we will instead return the idlest remote group in that case. BACKPORT: Note 4.4 is missing commit 6b94780e45c1 "sched/core: Use load_avg for selecting idlest group", so we only have to fix this_load instead of this_runnable_load and this_avg_load. 
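To see why the 0 initialization was wrong, consider the final check in find_idlest_group() (visible in the skip_spare hunk earlier in this series) when the local group was not allowed and this_load kept its initial value:

	if (!idlest || 100*this_load < imbalance*min_load)
		return NULL;	/* with this_load == 0 this always returns NULL, claiming the local group is idlest */

Starting this_load at ULONG_MAX makes the comparison fail for a skipped local group, so the idlest remote group is returned instead.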
Change-Id: I41f775b0e7c8f5e675c2780f955bb130a563cba7 Signed-off-by: Brendan Jackman Reviewed-by: Vincent Guittot Reviewed-by: Josef Bacik Cc: Dietmar Eggemann Cc: Vincent Guittot Cc: Josef Bacik Cc: Ingo Molnar Cc: Morten Rasmussen Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20171005114516.18617-4-brendan.jackman@arm.com Signed-off-by: Ingo Molnar (cherry-picked-from: commit 0d10ab952e99 tip:sched/core) (backport changes described above) Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index cf28d82fad41..dc685b67d08b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5983,7 +5983,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, { struct sched_group *idlest = NULL, *group = sd->groups; struct sched_group *most_spare_sg = NULL; - unsigned long min_load = ULONG_MAX, this_load = 0; + unsigned long min_load = ULONG_MAX, this_load = ULONG_MAX; unsigned long most_spare = 0, this_spare = 0; int load_idx = sd->forkexec_idx; int imbalance = 100 + (sd->imbalance_pct-100)/2; -- cgit v1.2.3 From 411654764590c071ef55242ffd50c4e74930353b Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Thu, 31 Aug 2017 12:58:01 +0100 Subject: UPSTREAM: sched/fair: Fix usage of find_idlest_group() when no groups are allowed When 'p' is not allowed on any of the CPUs in the sched_domain, we currently return NULL from find_idlest_group(), and pointlessly continue the search on lower sched_domain levels (where 'p' is also not allowed) before returning prev_cpu regardless (as we have not updated new_cpu). Add an explicit check for this case, and add a comment to find_idlest_group(). Now when find_idlest_group() returns NULL, it always means that the local group is allowed and idlest. Change-Id: I5f2648d2f7fb0465677961ecb7473df3d06f0057 Signed-off-by: Brendan Jackman Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Reviewed-by: Josef Bacik Cc: Dietmar Eggemann Cc: Josef Bacik Cc: Linus Torvalds Cc: Mike Galbraith Cc: Morten Rasmussen Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20171005114516.18617-5-brendan.jackman@arm.com Signed-off-by: Ingo Molnar (cherry-picked-from: commit 6fee85ccbc76 tip:sched/core) Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index dc685b67d08b..0e3a9935515c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5976,6 +5976,8 @@ static unsigned long capacity_spare_wake(int cpu, struct task_struct *p) /* * find_idlest_group finds and returns the least busy CPU group within the * domain. + * + * Assumes p is allowed on at least one CPU in sd. */ static struct sched_group * find_idlest_group(struct sched_domain *sd, struct task_struct *p, @@ -6136,6 +6138,9 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p schedstat_inc(this_rq(), eas_stats.cas_attempts); } + if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed)) + return prev_cpu; + while (sd) { struct sched_group *group; struct sched_domain *tmp; -- cgit v1.2.3 From 5a8663664915417aec806da2b0bc534d675f413d Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Thu, 31 Aug 2017 12:58:02 +0100 Subject: UPSTREAM: sched/fair: Fix usage of find_idlest_group() when the local group is idlest find_idlest_group() returns NULL when the local group is idlest. 
The caller then continues the find_idlest_group() search at a lower level of the current CPU's sched_domain hierarchy. find_idlest_group_cpu() is not consulted and, crucially, @new_cpu is not updated. This means the search is pointless and we return @prev_cpu from select_task_rq_fair(). This is fixed by initialising @new_cpu to @cpu instead of @prev_cpu. Change-Id: Ie531f5bb29775952bdc4c148b6e974b2f5f32b7a Signed-off-by: Brendan Jackman Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Josef Bacik Reviewed-by: Vincent Guittot Cc: Dietmar Eggemann Cc: Josef Bacik Cc: Linus Torvalds Cc: Mike Galbraith Cc: Morten Rasmussen Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20171005114516.18617-6-brendan.jackman@arm.com Signed-off-by: Ingo Molnar (cherry-picked-from: commit 93f50f90247e tip:sched/core) Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0e3a9935515c..bf7b7a7e778b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6129,7 +6129,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p, int cpu, int prev_cpu, int sd_flag) { - int new_cpu = prev_cpu; + int new_cpu = cpu; int wu = sd_flag & SD_BALANCE_WAKE; int cas_cpu = -1; -- cgit v1.2.3 From 2f30db8df4076c54f92ac2ebe26734e27c164fd3 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Tue, 1 Aug 2017 15:48:37 +0100 Subject: UPSTREAM: sched/fair: Sync task util before slow-path wakeup We use task_util() in find_idlest_group() via capacity_spare_wake(). This task_util() is updated in wake_cap(). However wake_cap() is not the only reason for ending up in find_idlest_group() - we could have been sent there by wake_wide(). So explicitly sync the task util with prev_cpu when we are about to head to find_idlest_group(). We could simply do this at the beginning of select_task_rq_fair() (i.e. irrespective of whether we're heading to select_idle_sibling() or find_idlest_group() & co), but I didn't want to slow down the select_idle_sibling() path more than necessary. Don't do this during fork balancing, we won't need the task_util and we'd just clobber the last_update_time, which is supposed to be 0. Change-Id: I935f4bfdfec3e8b914457aac3387ce264d5fd484 Signed-off-by: Brendan Jackman Signed-off-by: Peter Zijlstra (Intel) Cc: Andres Oportus Cc: Dietmar Eggemann Cc: Joel Fernandes Cc: Josef Bacik Cc: Linus Torvalds Cc: Mike Galbraith Cc: Morten Rasmussen Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vincent Guittot Link: http://lkml.kernel.org/r/20170808095519.10077-1-brendan.jackman@arm.com Signed-off-by: Ingo Molnar (cherry-picked-from: commit ea16f0ea6c3d tip:sched/core) Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index bf7b7a7e778b..5d8081b77f8a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6757,6 +6757,15 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f new_cpu = cpu; } + if (sd && !(sd_flag & SD_BALANCE_FORK)) { + /* + * We're going to need the task's util for capacity_spare_wake + * in find_idlest_group. Sync it up to prev_cpu's + * last_update_time. + */ + sync_entity_load_avg(&p->se); + } + if (!sd) { if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? 
*/ new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); -- cgit v1.2.3 From 2aada289d7be37116f21d70177cb06a929b5d961 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 7 Sep 2017 12:24:45 +0100 Subject: sched/fair: trace energy_diff for non boosted tasks In systems where SchedTune is enabled, we do not report energy diff for non boosted tasks. Let's fix this by always generating an energy_diff event, where however: nrg.delta = 0, since we skip energy normalization payoff = nrg.diff, since the payoff is defined just by the energy difference Change-Id: I9a11ec19b6f56da04147f5ae5b47daf1dd180445 Signed-off-by: Patrick Bellasi Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5d8081b77f8a..a17820385ee2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5702,8 +5702,14 @@ energy_diff(struct energy_env *eenv) __energy_diff(eenv); /* Return energy diff when boost margin is 0 */ - if (boost == 0) + if (boost == 0) { + trace_sched_energy_diff(eenv->task, + eenv->src_cpu, eenv->dst_cpu, eenv->util_delta, + eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff, + eenv->cap.before, eenv->cap.after, eenv->cap.delta, + 0, -eenv->nrg.diff); return eenv->nrg.diff; + } /* Compute normalized energy diff */ nrg_delta = normalize_energy(eenv->nrg.diff); -- cgit v1.2.3 From 4edc5b0e387a906a00cb45af2abf76aa3a930438 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 7 Sep 2017 12:27:56 +0100 Subject: sched/fair: ignore backup CPU when not valid find_best_target() can sometimes fail to return a valid backup CPU, either because it cannot find one or just because it returns prev_cpu as a backup. In these cases we should skip the energy_diff evaluation for the backup CPU. Change-Id: I3787dbdfe74122348dd7a7485b88c4679051bd32 Signed-off-by: Patrick Bellasi Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a17820385ee2..417b373e4074 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6686,7 +6686,9 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync /* No energy saving for target_cpu, try backup */ target_cpu = tmp_backup; eenv.dst_cpu = target_cpu; - if (tmp_backup < 0 || energy_diff(&eenv) >= 0) { + if (tmp_backup < 0 || + tmp_backup == prev_cpu || + energy_diff(&eenv) >= 0) { schedstat_inc(p, se.statistics.nr_wakeups_secb_no_nrg_sav); schedstat_inc(this_rq(), eas_stats.secb_no_nrg_sav); target_cpu = prev_cpu; -- cgit v1.2.3 From ca42e804464bc9cb81e2ad07fe689c9f8514d5fe Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Mon, 17 Jul 2017 15:54:39 +0100 Subject: sched/fair: enforce EAS mode For non latency sensitive tasks, the goal is to optimize for energy efficiency. Thus, we should try our best to avoid moving a task onto a CPU which is then going to be marked as overutilized. Let's use the capacity_margin metric to verify whether a candidate target CPU can be considered without risking bailing out of EAS mode.
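To illustrate the new find_best_target() check with assumed numbers (capacity_margin is taken here to be 1280, the usual default in this tree, i.e. roughly an 80% threshold):

	/* Candidate CPU with capacity_orig = 1024; placing the task there
	 * would raise its utilization to new_util = 900: */
	900 * 1280 = 1152000  >  1024 * 1024 = 1048576	/* -> skip this CPU */

So a candidate is skipped once the post-placement utilization would exceed capacity_orig * 1024/1280 (about 819 here), keeping the CPU from being pushed into the overutilized range.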
Change-Id: Ib3697106f4073aedf4a6c6ce42bd5d000fa8c007 Signed-off-by: Patrick Bellasi Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 417b373e4074..0784864eeb30 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6469,6 +6469,19 @@ static inline int find_best_target(struct task_struct *p, int *backup_cpu, continue; } + /* + * Enforce EAS mode + * + * For non latency sensitive tasks, skip CPUs that + * will be overutilized by moving the task there. + * + * The goal here is to remain in EAS mode as long as + * possible at least for !prefer_idle tasks. + */ + if ((new_util * capacity_margin) > + (capacity_orig * SCHED_CAPACITY_SCALE)) + continue; + /* * Case B) Non latency sensitive tasks on IDLE CPUs. * -- cgit v1.2.3 From 5f8b3a757d6561e5668cb09c75b856347263718b Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Tue, 12 Sep 2017 14:44:24 +0100 Subject: sched/fair: consider task utilization in group_norm_util() The group_norm_util() function is used to compute the normalized utilization of an SG given a certain energy_env configuration. The main client of this function is the energy_diff function when it comes to computing the SG energy for one of the before/after scheduling candidates. Currently, the energy_diff function sets util_delta = 0 when it wants to compute the energy corresponding to the scheduling candidate where the task runs on the previous CPU. This implies that, for the task waking up on the previous CPU, we consider only its blocked load tracked by the CPU RQ. However, in the case of a medium-big task waking up on a long-idle CPU, this blocked load can already be completely decayed. More generally, the current approach is biased towards under-estimating the energy consumption for the "before" scheduling candidate. This patch fixes this by: - always using cpu_util_wake() to properly get the utilization of a CPU without any (partially decayed) contribution of the waking-up task - adding the task utilization to the cpu_util_wake() value just for the target CPU The "target CPU" is defined by the energy_env to be either the src_cpu or the dst_cpu, depending on which scheduling candidate we are considering. This patch also updates the definition of __cpu_norm_util(), which is currently called just by the group_norm_util() function. This simplifies the code by using this function just to normalize a specified utilization with respect to a given capacity. This update makes it possible to completely remove any dependency of group_norm_util() on calc_util_delta(). Change-Id: I3b6ec50ce8decb1521faae660e326ab3319d3c82 Signed-off-by: Patrick Bellasi Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 58 +++++++++++++++++++++++++++++++++-------------------- 1 file changed, 36 insertions(+), 22 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0784864eeb30..9575473f0cf6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5297,6 +5297,7 @@ struct energy_env { int util_delta; int src_cpu; int dst_cpu; + int trg_cpu; int energy; int payoff; struct task_struct *task; @@ -5313,11 +5314,14 @@ struct energy_env { } cap; }; +static int cpu_util_wake(int cpu, struct task_struct *p); + /* * __cpu_norm_util() returns the cpu util relative to a specific capacity, - * i.e. it's busy ratio, in the range [0..SCHED_LOAD_SCALE] which is useful for - * energy calculations.
Using the scale-invariant util returned by - * cpu_util() and approximating scale-invariant util by: + * i.e. it's busy ratio, in the range [0..SCHED_LOAD_SCALE], which is useful for + * energy calculations. + * + * Since util is a scale-invariant utilization defined as: * * util ~ (curr_freq/max_freq)*1024 * capacity_orig/1024 * running_time/time * @@ -5327,10 +5331,8 @@ struct energy_env { * * norm_util = running_time/time ~ util/capacity */ -static unsigned long __cpu_norm_util(int cpu, unsigned long capacity, int delta) +static unsigned long __cpu_norm_util(unsigned long util, unsigned long capacity) { - int util = __cpu_util(cpu, delta); - if (util >= capacity) return SCHED_CAPACITY_SCALE; @@ -5362,28 +5364,37 @@ unsigned long group_max_util(struct energy_env *eenv) /* * group_norm_util() returns the approximated group util relative to it's - * current capacity (busy ratio) in the range [0..SCHED_LOAD_SCALE] for use in - * energy calculations. Since task executions may or may not overlap in time in - * the group the true normalized util is between max(cpu_norm_util(i)) and - * sum(cpu_norm_util(i)) when iterating over all cpus in the group, i. The - * latter is used as the estimate as it leads to a more pessimistic energy + * current capacity (busy ratio), in the range [0..SCHED_LOAD_SCALE], for use + * in energy calculations. + * + * Since task executions may or may not overlap in time in the group the true + * normalized util is between MAX(cpu_norm_util(i)) and SUM(cpu_norm_util(i)) + * when iterating over all CPUs in the group. + * The latter estimate is used as it leads to a more pessimistic energy * estimate (more busy). */ static unsigned long group_norm_util(struct energy_env *eenv, struct sched_group *sg) { - int i, delta; - unsigned long util_sum = 0; unsigned long capacity = sg->sge->cap_states[eenv->cap_idx].cap; + unsigned long util, util_sum = 0; + int cpu; - for_each_cpu(i, sched_group_cpus(sg)) { - delta = calc_util_delta(eenv, i); - util_sum += __cpu_norm_util(i, capacity, delta); + for_each_cpu(cpu, sched_group_cpus(sg)) { + util = cpu_util_wake(cpu, eenv->task); + + /* + * If we are looking at the target CPU specified by the eenv, + * then we should add the (estimated) utilization of the task + * assuming we will wake it up on that CPU. + */ + if (unlikely(cpu == eenv->trg_cpu)) + util += eenv->util_delta; + + util_sum += __cpu_norm_util(util, capacity); } - if (util_sum > SCHED_CAPACITY_SCALE) - return SCHED_CAPACITY_SCALE; - return util_sum; + return min_t(unsigned long, util_sum, SCHED_CAPACITY_SCALE); } static int find_new_capacity(struct energy_env *eenv, @@ -5575,6 +5586,8 @@ static inline bool cpu_in_sg(struct sched_group *sg, int cpu) return cpu != -1 && cpumask_test_cpu(cpu, sched_group_cpus(sg)); } +static inline unsigned long task_util(struct task_struct *p); + /* * energy_diff(): Estimate the energy impact of changing the utilization * distribution. 
eenv specifies the change: utilisation amount, source, and
@@ -5590,11 +5603,13 @@ static inline int __energy_diff(struct energy_env *eenv)
 	int diff, margin;

 	struct energy_env eenv_before = {
-		.util_delta	= 0,
+		.util_delta	= task_util(eenv->task),
 		.src_cpu	= eenv->src_cpu,
 		.dst_cpu	= eenv->dst_cpu,
+		.trg_cpu	= eenv->src_cpu,
 		.nrg		= { 0, 0, 0, 0},
 		.cap		= { 0, 0, 0 },
+		.task		= eenv->task,
 	};

 	if (eenv->src_cpu == eenv->dst_cpu)
@@ -5972,8 +5987,6 @@ boosted_task_util(struct task_struct *task)
 	return util + margin;
 }

-static int cpu_util_wake(int cpu, struct task_struct *p);
-
 static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
 {
 	return capacity_orig_of(cpu) - cpu_util_wake(cpu, p);
 }
@@ -6681,6 +6694,7 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync
 		.src_cpu	= prev_cpu,
 		.dst_cpu	= target_cpu,
 		.task		= p,
+		.trg_cpu	= target_cpu,
 	};
-- cgit v1.2.3

From 3c71cbb896fe15de4f365223af45096c4098c309 Mon Sep 17 00:00:00 2001
From: Patrick Bellasi
Date: Thu, 1 Jun 2017 16:40:22 +0100
Subject: sched/fair: consider task utilization in group_max_util()

The group_max_util() function is used to compute the maximum
utilization across the CPUs of a certain energy_env configuration. Its
main client is energy_diff when it needs to compute the SG capacity for
one of the before/after scheduling candidates.

Currently, energy_diff sets util_delta = 0 when it wants to compute the
energy corresponding to the scheduling candidate where the task runs on
the previous CPU. This implies that, for the task waking up on the
previous CPU, we consider only its blocked load as tracked by the CPU
RQ. However, for a medium-big task waking up on a CPU that has been
idle for a long time, this blocked load can already be completely
decayed. More generally, the current approach is biased towards
under-estimating the capacity requirements of the "before" scheduling
candidate.

This patch fixes this by:
- always using cpu_util_wake() to properly get the utilization of a CPU
  without any (partially decayed) contribution of the waking up task
- adding the task utilization to cpu_util_wake just for the target CPU

The "target CPU" is defined by the energy_env to be either the src_cpu
or the dst_cpu, depending on which scheduling candidate we are
considering.

Finally, since this update removes the last usage of calc_util_delta(),
that function is now safely removed.
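As an illustration of the estimate that both this patch and the
group_norm_util() change above rely on, here is a stand-alone toy
model; the CPU count, utilization numbers and helper signature are
assumptions made for the example only:

	#include <stdio.h>

	#define NCPUS 4

	/* max utilization across a group for one scheduling candidate */
	static unsigned long group_max_util(const unsigned long util_wake[NCPUS],
					    unsigned long task_util, int trg_cpu)
	{
		unsigned long util, max = 0;
		int cpu;

		for (cpu = 0; cpu < NCPUS; cpu++) {
			util = util_wake[cpu];
			if (cpu == trg_cpu)
				util += task_util; /* task placed here */
			if (util > max)
				max = util;
		}
		return max;
	}

	int main(void)
	{
		/* per-CPU utilization with the waking task's load removed */
		const unsigned long util_wake[NCPUS] = { 100, 250, 50, 75 };
		const unsigned long task_util = 300;

		/* "before" candidate: task on its prev_cpu, CPU 0 */
		printf("%lu\n", group_max_util(util_wake, task_util, 0)); /* 400 */
		/* "after" candidate: task on dst_cpu, CPU 2 */
		printf("%lu\n", group_max_util(util_wake, task_util, 2)); /* 350 */
		return 0;
	}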
Change-Id: I20ee1bcf40cee6bf6e265fb2d32ef79061ad6ced
Signed-off-by: Patrick Bellasi
Signed-off-by: Chris Redpath
---
 kernel/sched/fair.c | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 9575473f0cf6..e9919aeacdd5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5339,24 +5339,24 @@ static unsigned long __cpu_norm_util(unsigned long util, unsigned long capacity)
 	return (util << SCHED_CAPACITY_SHIFT)/capacity;
 }

-static int calc_util_delta(struct energy_env *eenv, int cpu)
+static unsigned long group_max_util(struct energy_env *eenv)
 {
-	if (cpu == eenv->src_cpu)
-		return -eenv->util_delta;
-	if (cpu == eenv->dst_cpu)
-		return eenv->util_delta;
-	return 0;
-}
-
-static
-unsigned long group_max_util(struct energy_env *eenv)
-{
-	int i, delta;
 	unsigned long max_util = 0;
+	unsigned long util;
+	int cpu;
+
+	for_each_cpu(cpu, sched_group_cpus(eenv->sg_cap)) {
+		util = cpu_util_wake(cpu, eenv->task);
+
+		/*
+		 * If we are looking at the target CPU specified by the eenv,
+		 * then we should add the (estimated) utilization of the task
+		 * assuming we will wake it up on that CPU.
+		 */
+		if (unlikely(cpu == eenv->trg_cpu))
+			util += eenv->util_delta;

-	for_each_cpu(i, sched_group_cpus(eenv->sg_cap)) {
-		delta = calc_util_delta(eenv, i);
-		max_util = max(max_util, __cpu_util(i, delta));
+		max_util = max(max_util, util);
 	}

 	return max_util;
-- cgit v1.2.3

From e3ba92c160d3fb21af5b28a21c63851fad21b168 Mon Sep 17 00:00:00 2001
From: Russ Weight
Date: Thu, 8 Jun 2017 11:38:59 -0700
Subject: sched/tune: access schedtune_initialized under CGROUP_SCHEDTUNE

schedtune_initialized is protected by CONFIG_CGROUP_SCHEDTUNE, but is
being used without CONFIG_CGROUP_SCHEDTUNE being defined. Add
appropriate ifdefs around the usage of schedtune_initialized to avoid a
compilation error when CONFIG_CGROUP_SCHEDTUNE is not defined.

Change-Id: Iab79bf053d74db3eeb84c09d71d43b4e39746ed2
Signed-off-by: Russ Weight
Signed-off-by: Fei Yang
---
 kernel/sched/fair.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e9919aeacdd5..6023d9e3a9f5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5668,7 +5668,11 @@ static inline int __energy_diff(struct energy_env *eenv)

 #ifdef CONFIG_SCHED_TUNE
 struct target_nrg schedtune_target_nrg;
+
+#ifdef CONFIG_CGROUP_SCHEDTUNE
 extern bool schedtune_initialized;
+#endif /* CONFIG_CGROUP_SCHEDTUNE */
+
 /*
  * System energy normalization
  * Returns the normalized value, in the range [0..SCHED_CAPACITY_SCALE],
@@ -5679,9 +5683,11 @@ normalize_energy(int energy_diff)
 {
 	u32 normalized_nrg;

+#ifdef CONFIG_CGROUP_SCHEDTUNE
 	/* during early setup, we don't know the extents */
 	if (unlikely(!schedtune_initialized))
 		return energy_diff < 0 ? -1 : 1 ;
+#endif /* CONFIG_CGROUP_SCHEDTUNE */

 #ifdef CONFIG_SCHED_DEBUG
 	{
-- cgit v1.2.3

From effc721b3c9b3bd258313450cdfd3d0e644f4d85 Mon Sep 17 00:00:00 2001
From: Leo Yan
Date: Mon, 7 Aug 2017 18:14:37 +0800
Subject: sched/fair: remove useless variable in find_best_target

Patch 5680f23f20c7 ("sched/fair: streamline find_best_target
heuristics") has reworked the find_best_target() function; as a result,
the variable "target_util" is now unused. So remove it.
Change-Id: I5447062419e5828a49115119984fac6cd37db034
Signed-off-by: Leo Yan
---
 kernel/sched/fair.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6023d9e3a9f5..9bc717ef81f5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6350,7 +6350,6 @@ static inline int find_best_target(struct task_struct *p, int *backup_cpu,
 	unsigned long target_capacity = ULONG_MAX;
 	unsigned long min_wake_util = ULONG_MAX;
 	unsigned long target_max_spare_cap = 0;
-	unsigned long target_util = ULONG_MAX;
 	unsigned long best_active_util = ULONG_MAX;
 	int best_idle_cstate = INT_MAX;
 	struct sched_domain *sd;
@@ -6579,7 +6578,6 @@ static inline int find_best_target(struct task_struct *p, int *backup_cpu,

 			target_max_spare_cap = capacity_orig - new_util;
 			target_capacity = capacity_orig;
-			target_util = new_util;
 			target_cpu = i;
 		}

-- cgit v1.2.3

From 43bd960dfe728284e1059f11d6c686d23887c1c6 Mon Sep 17 00:00:00 2001
From: Joonwoo Park
Date: Fri, 3 Feb 2017 11:15:31 -0800
Subject: sched: WALT: account cumulative window demand

Energy cost estimation has been a long-standing challenge for WALT
because WALT guides CPU frequency based on the CPU utilization of the
previous window. Consequently, it's not possible to know a newly
waking-up task's energy cost until the end of WALT's current window.

WALT already tracks the 'Previous Runnable Sum' (prev_runnable_sum)
and the 'Cumulative Runnable Average' (cr_avg). They are designed for
CPU frequency guidance and task placement, but unfortunately neither is
suitable for energy cost estimation.

That's because using prev_runnable_sum for the energy cost calculation
would account CPU and task energy solely based on activity in the
previous window, so, for example, any task that had no activity in the
previous window would be accounted as a 'zero energy cost' task.
Energy estimation with cr_avg is what energy_diff() relies on at
present. However, cr_avg can only represent an instantaneous picture
of the energy cost: if a CPU was fully occupied for an entire WALT
window but became idle just before the window boundary, on a subsequent
wake-up energy_diff() accounts that CPU as a 'zero energy cost' CPU.

As a result, introduce a new accounting unit, 'Cumulative Window
Demand'. The cumulative window demand tracks all the task demands seen
in the current window, which is neither instantaneous nor actual
execution time. Because task demand represents the estimated scaled
execution time when the task runs a full window, the accumulation of
all the demands represents the predicted CPU load at the end of the
window. Thus we can estimate the CPU's frequency at the end of the
current WALT window with the cumulative window demand.

The use of prev_runnable_sum for CPU frequency guidance and of cr_avg
for task placement has not changed; both are still used for those
purposes, while this patch adds an additional statistic.
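For illustration, a minimal user-space model of the new counter; the
demand values are invented, and only the clamp-at-zero behaviour
mirrors fixup_cum_window_demand() in the patch below:

	#include <stdio.h>

	typedef long long s64;
	typedef unsigned long long u64;

	static u64 cum_window_demand;

	/* accumulate a (possibly negative) delta, but never go negative */
	static void fixup_cum_window_demand(s64 delta)
	{
		cum_window_demand += delta;
		if ((s64)cum_window_demand < 0)
			cum_window_demand = 0;
	}

	int main(void)
	{
		fixup_cum_window_demand(512);        /* task A seen this window */
		fixup_cum_window_demand(256);        /* task B seen this window */
		fixup_cum_window_demand(-512);       /* task A migrates away    */
		printf("%llu\n", cum_window_demand); /* 256 */
		return 0;
	}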
Change-Id: I9908c77ead9973a26dea2b36c001c2baf944d4f5 Signed-off-by: Joonwoo Park --- include/linux/sched.h | 1 + kernel/sched/core.c | 8 ++++++++ kernel/sched/sched.h | 12 +++++++++++ kernel/sched/walt.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++--- 4 files changed, 74 insertions(+), 3 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 1872d19d1702..62e179f84aef 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1568,6 +1568,7 @@ struct task_struct { * of this task */ u32 init_load_pct; + u64 last_sleep_ts; #endif #ifdef CONFIG_CGROUP_SCHED diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ca9d72fa8e66..563f316f5330 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2170,6 +2170,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->se.prev_sum_exec_runtime = 0; p->se.nr_migrations = 0; p->se.vruntime = 0; +#ifdef CONFIG_SCHED_WALT + p->last_sleep_ts = 0; +#endif + INIT_LIST_HEAD(&p->se.group_node); walt_init_new_task_load(p); @@ -3379,6 +3383,10 @@ static void __sched notrace __schedule(bool preempt) rq->clock_skip_update = 0; if (likely(prev != next)) { +#ifdef CONFIG_SCHED_WALT + if (!prev->on_rq) + prev->last_sleep_ts = wallclock; +#endif rq->nr_switches++; rq->curr = next; ++*switch_count; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 8d3712107e61..deba080af845 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -690,6 +690,7 @@ struct rq { u64 cur_irqload; u64 avg_irqload; u64 irqload_ts; + u64 cum_window_demand; #endif /* CONFIG_SCHED_WALT */ @@ -2101,6 +2102,17 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) {} #endif /* CONFIG_CPU_FREQ */ +#ifdef CONFIG_SCHED_WALT + +static inline bool +walt_task_in_cum_window_demand(struct rq *rq, struct task_struct *p) +{ + return cpu_of(rq) == task_cpu(p) && + (p->on_rq || p->last_sleep_ts >= rq->window_start); +} + +#endif /* CONFIG_SCHED_WALT */ + #ifdef arch_scale_freq_capacity #ifndef arch_scale_freq_invariant #define arch_scale_freq_invariant() (true) diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c index 441cba01bc04..93f61486f989 100644 --- a/kernel/sched/walt.c +++ b/kernel/sched/walt.c @@ -70,11 +70,28 @@ static unsigned int task_load(struct task_struct *p) return p->ravg.demand; } +static inline void fixup_cum_window_demand(struct rq *rq, s64 delta) +{ + rq->cum_window_demand += delta; + if (unlikely((s64)rq->cum_window_demand < 0)) + rq->cum_window_demand = 0; +} + void walt_inc_cumulative_runnable_avg(struct rq *rq, struct task_struct *p) { rq->cumulative_runnable_avg += p->ravg.demand; + + /* + * Add a task's contribution to the cumulative window demand when + * + * (1) task is enqueued with on_rq = 1 i.e migration, + * prio/cgroup/class change. + * (2) task is waking for the first time in this window. + */ + if (p->on_rq || (p->last_sleep_ts < rq->window_start)) + fixup_cum_window_demand(rq, p->ravg.demand); } void @@ -83,6 +100,14 @@ walt_dec_cumulative_runnable_avg(struct rq *rq, { rq->cumulative_runnable_avg -= p->ravg.demand; BUG_ON((s64)rq->cumulative_runnable_avg < 0); + + /* + * on_rq will be 1 for sleeping tasks. So check if the task + * is migrating or dequeuing in RUNNING state to change the + * prio/cgroup/class. 
+	 */
+	if (task_on_rq_migrating(p) || p->state == TASK_RUNNING)
+		fixup_cum_window_demand(rq, -(s64)p->ravg.demand);
 }

 static void
@@ -95,6 +120,8 @@ fixup_cumulative_runnable_avg(struct rq *rq,
 	if ((s64)rq->cumulative_runnable_avg < 0)
 		panic("cra less than zero: tld: %lld, task_load(p) = %u\n",
 			task_load_delta, task_load(p));
+
+	fixup_cum_window_demand(rq, task_load_delta);
 }

 u64 walt_ktime_clock(void)
@@ -180,6 +207,8 @@ update_window_start(struct rq *rq, u64 wallclock)

 	nr_windows = div64_u64(delta, walt_ravg_window);
 	rq->window_start += (u64)nr_windows * (u64)walt_ravg_window;
+
+	rq->cum_window_demand = rq->cumulative_runnable_avg;
 }

 /*
@@ -568,10 +597,20 @@ static void update_history(struct rq *rq, struct task_struct *p,
 	 * A throttled deadline sched class task gets dequeued without
 	 * changing p->on_rq. Since the dequeue decrements hmp stats
 	 * avoid decrementing it here again.
+	 *
+	 * When window is rolled over, the cumulative window demand
+	 * is reset to the cumulative runnable average (contribution from
+	 * the tasks on the runqueue). If the current task is dequeued
+	 * already, it's demand is not included in the cumulative runnable
+	 * average. So add the task demand separately to cumulative window
+	 * demand.
 	 */
-	if (task_on_rq_queued(p) && (!task_has_dl_policy(p) ||
-				!p->dl.dl_throttled))
-		fixup_cumulative_runnable_avg(rq, p, demand);
+	if (!task_has_dl_policy(p) || !p->dl.dl_throttled) {
+		if (task_on_rq_queued(p))
+			fixup_cumulative_runnable_avg(rq, p, demand);
+		else if (rq->curr == p)
+			fixup_cum_window_demand(rq, demand);
+	}

 	p->ravg.demand = demand;

@@ -792,6 +831,17 @@ void walt_fixup_busy_time(struct task_struct *p, int new_cpu)
 	walt_update_task_ravg(p, task_rq(p), TASK_MIGRATE,
 			wallclock, 0);

+	/*
+	 * When a task is migrating during the wakeup, adjust
+	 * the task's contribution towards cumulative window
+	 * demand.
+	 */
+	if (p->state == TASK_WAKING &&
+	    p->last_sleep_ts >= src_rq->window_start) {
+		fixup_cum_window_demand(src_rq, -(s64)p->ravg.demand);
+		fixup_cum_window_demand(dest_rq, p->ravg.demand);
+	}
+
 	if (p->ravg.curr_window) {
 		src_rq->curr_runnable_sum -= p->ravg.curr_window;
 		dest_rq->curr_runnable_sum += p->ravg.curr_window;
-- cgit v1.2.3

From 38ddcff85af052ad99e4f3f0b6e9659b0ca10dcf Mon Sep 17 00:00:00 2001
From: Brendan Jackman
Date: Mon, 7 Aug 2017 15:46:13 +0100
Subject: FROMLIST: sched/fair: Use wake_q length as a hint for wake_wide

(from https://patchwork.kernel.org/patch/9895261/)

This patch adds a parameter to select_task_rq, sibling_count_hint,
allowing the caller, where it has this information, to inform the
sched_class of the number of tasks that are being woken up as part of
the same event.

The wake_q mechanism is one case where this information is available.

select_task_rq_fair can then use the information to detect that it
needs to widen the search space for task placement in order to avoid
overloading the last-level cache domain's CPUs.

* * *

The reason I am investigating this change is the following use case on
ARM big.LITTLE (asymmetrical CPU capacity): 1 task per CPU, which all
repeatedly do X amount of work then pthread_barrier_wait (i.e. sleep
until the last task finishes its X and hits the barrier).
On big.LITTLE, the tasks which get a "big" CPU finish faster, and then
those CPUs pull over the tasks that are still running:

     v CPU v           ->time->

                    -------------
   0  (big)         11111  /333
                    -------------
   1  (big)         22222 /444|
                    -------------
   2  (LITTLE)      333333/
                    -------------
   3  (LITTLE)      444444/
                    -------------

Now when task 4 hits the barrier (at |) and wakes the others up, there
are 4 tasks with prev_cpu=<big CPU> and 0 tasks with
prev_cpu=<LITTLE CPU>. want_affine therefore means that we'll only look
in CPUs 0 and 1 (sd_llc), so tasks will be unnecessarily coscheduled on
the bigs until the next load balance, something like this:

     v CPU v           ->time->

                    ------------------------
   0  (big)         11111  /333  31313\33333
                    ------------------------
   1  (big)         22222 /444|424\4444444
                    ------------------------
   2  (LITTLE)      333333/          \222222
                    ------------------------
   3  (LITTLE)      444444/            \1111
                    ------------------------
                                 ^^^
                          underutilization

So, I'm trying to get want_affine = 0 for these tasks.

I don't _think_ any incarnation of the wakee_flips mechanism can help
us here because which task is waker and which tasks are wakees
generally changes with each iteration.

However pthread_barrier_wait (or more accurately FUTEX_WAKE) has the
nice property that we know exactly how many tasks are being woken, so
we can cheat.

It might be a disadvantage that we "widen" _every_ task that's woken in
an event, while select_idle_sibling would work fine for the first
sd_llc_size - 1 tasks.

IIUC, if wake_affine() behaves correctly this trick wouldn't be
necessary on SMP systems, so it might be best guarded by the presence
of SD_ASYM_CPUCAPACITY?

* * *

Final note.. In order to observe "perfect" behaviour for this use case,
I also had to disable the TTWU_QUEUE sched feature. Suppose during the
wakeup above we are working through the wake queue and have placed
tasks 3 and 2, and are about to place task 1:

     v CPU v           ->time->

                    --------------
   0  (big)         11111  /333  3
                    --------------
   1  (big)         22222 /444|4
                    --------------
   2  (LITTLE)      333333/      2
                    --------------
   3  (LITTLE)      444444/          <- Task 1 should go here
                    --------------

If TTWU_QUEUE is enabled, we will not yet have enqueued task 2 (having
instead sent a reschedule IPI) or attached its load to CPU 2. So we are
likely to also place task 1 on cpu 2. Disabling TTWU_QUEUE means that
we enqueue task 2 before placing task 1, solving this issue. TTWU_QUEUE
is there to minimise rq lock contention, and I guess that this
contention is less of an issue on big.LITTLE systems since they have
relatively few CPUs, which suggests the trade-off makes sense here.
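The resulting heuristic is small enough to model in isolation. A
stand-alone sketch (llc_size and the flip counts are example inputs,
not kernel state):

	#include <stdio.h>

	static int wake_wide(unsigned int master, unsigned int slave,
			     unsigned int llc_size,
			     unsigned int sibling_count_hint)
	{
		unsigned int tmp;

		/* A wakeup batch that fills the LLC domain always goes wide. */
		if (sibling_count_hint >= llc_size)
			return 1;

		if (master < slave) {	/* swap so that master >= slave */
			tmp = master;
			master = slave;
			slave = tmp;
		}
		if (slave < llc_size || master < slave * llc_size)
			return 0;
		return 1;
	}

	int main(void)
	{
		/* barrier wakeup: 4 wakees on a 2-CPU LLC -> widen */
		printf("%d\n", wake_wide(0, 0, 2, 4));  /* 1 */
		/* ordinary single wakeup falls back to wakee flips */
		printf("%d\n", wake_wide(10, 1, 2, 1)); /* 0 */
		return 0;
	}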
Change-Id: I2080302839a263e0841a89efea8589ea53bbda9c Signed-off-by: Brendan Jackman Signed-off-by: Chris Redpath Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Josef Bacik Cc: Joel Fernandes Cc: Mike Galbraith Cc: Matt Fleming --- include/linux/sched.h | 3 ++- kernel/sched/core.c | 35 +++++++++++++++++++++++------------ kernel/sched/deadline.c | 3 ++- kernel/sched/fair.c | 21 ++++++++++++++------- kernel/sched/idle_task.c | 3 ++- kernel/sched/rt.c | 3 ++- kernel/sched/sched.h | 3 ++- kernel/sched/stop_task.c | 3 ++- 8 files changed, 49 insertions(+), 25 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 62e179f84aef..60af9d2fa922 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -993,12 +993,13 @@ struct wake_q_node { struct wake_q_head { struct wake_q_node *first; struct wake_q_node **lastp; + int count; }; #define WAKE_Q_TAIL ((struct wake_q_node *) 0x01) #define WAKE_Q(name) \ - struct wake_q_head name = { WAKE_Q_TAIL, &name.first } + struct wake_q_head name = { WAKE_Q_TAIL, &name.first, 0 } extern void wake_q_add(struct wake_q_head *head, struct task_struct *task); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 563f316f5330..18d607f9a417 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -546,6 +546,8 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task) if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL)) return; + head->count++; + get_task_struct(task); /* @@ -555,6 +557,10 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task) head->lastp = &node->next; } +static int +try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags, + int sibling_count_hint); + void wake_up_q(struct wake_q_head *head) { struct wake_q_node *node = head->first; @@ -569,10 +575,10 @@ void wake_up_q(struct wake_q_head *head) task->wake_q.next = NULL; /* - * wake_up_process() implies a wmb() to pair with the queueing + * try_to_wake_up() implies a wmb() to pair with the queueing * in wake_q_add() so as not to miss wakeups. */ - wake_up_process(task); + try_to_wake_up(task, TASK_NORMAL, 0, head->count); put_task_struct(task); } } @@ -1642,12 +1648,14 @@ out: * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. */ static inline -int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) +int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags, + int sibling_count_hint) { lockdep_assert_held(&p->pi_lock); if (p->nr_cpus_allowed > 1) - cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); + cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags, + sibling_count_hint); /* * In order not to call set_task_cpu() on a blocking task we need @@ -1932,6 +1940,8 @@ static void ttwu_queue(struct task_struct *p, int cpu) * @p: the thread to be awakened * @state: the mask of task states that can be woken * @wake_flags: wake modifier flags (WF_*) + * @sibling_count_hint: A hint at the number of threads that are being woken up + * in this event. * * Put it on the run-queue if it's not already there. The "current" * thread is always on the run-queue (except when the actual @@ -1943,7 +1953,8 @@ static void ttwu_queue(struct task_struct *p, int cpu) * or @state didn't match @p's state. 
*/ static int -try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) +try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags, + int sibling_count_hint) { unsigned long flags; int cpu, success = 0; @@ -2044,8 +2055,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) if (p->sched_class->task_waking) p->sched_class->task_waking(p); - cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); - + cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags, + sibling_count_hint); if (task_cpu(p) != cpu) { wake_flags |= WF_MIGRATED; set_task_cpu(p, cpu); @@ -2127,13 +2138,13 @@ out: */ int wake_up_process(struct task_struct *p) { - return try_to_wake_up(p, TASK_NORMAL, 0); + return try_to_wake_up(p, TASK_NORMAL, 0, 1); } EXPORT_SYMBOL(wake_up_process); int wake_up_state(struct task_struct *p, unsigned int state) { - return try_to_wake_up(p, state, 0); + return try_to_wake_up(p, state, 0, 1); } /* @@ -2467,7 +2478,7 @@ void wake_up_new_task(struct task_struct *p) * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, * as we're not fully set-up yet. */ - __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); + __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0, 1)); #endif rq = __task_rq_lock(p); update_rq_clock(rq); @@ -2905,7 +2916,7 @@ void sched_exec(void) int dest_cpu; raw_spin_lock_irqsave(&p->pi_lock, flags); - dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0); + dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0, 1); if (dest_cpu == smp_processor_id()) goto unlock; @@ -3560,7 +3571,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void) int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, void *key) { - return try_to_wake_up(curr->private, mode, wake_flags); + return try_to_wake_up(curr->private, mode, wake_flags, 1); } EXPORT_SYMBOL(default_wake_function); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index ab1a9a99660d..cdf2a0b97611 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1070,7 +1070,8 @@ static void yield_task_dl(struct rq *rq) static int find_later_rq(struct task_struct *task); static int -select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) +select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags, + int sibling_count_hint) { struct task_struct *curr; struct rq *rq; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9bc717ef81f5..b904a023be95 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5773,15 +5773,18 @@ energy_diff(struct energy_env *eenv) * being client/server, worker/dispatcher, interrupt source or whatever is * irrelevant, spread criteria is apparent partner count exceeds socket size. */ -static int wake_wide(struct task_struct *p) +static int wake_wide(struct task_struct *p, int sibling_count_hint) { unsigned int master = current->wakee_flips; unsigned int slave = p->wakee_flips; - int factor = this_cpu_read(sd_llc_size); + int llc_size = this_cpu_read(sd_llc_size); + + if (sibling_count_hint >= llc_size) + return 1; if (master < slave) swap(master, slave); - if (slave < factor || master < slave * factor) + if (slave < llc_size || master < slave * llc_size) return 0; return 1; } @@ -6754,7 +6757,8 @@ unlock: * preempt must be disabled. 
 */
 static int
-select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
+select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags,
+		    int sibling_count_hint)
 {
 	struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
 	int cpu = smp_processor_id();
@@ -6762,9 +6766,12 @@ static int select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 	int want_affine = 0;
 	int sync = wake_flags & WF_SYNC;

-	if (sd_flag & SD_BALANCE_WAKE)
-		want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu)
-			      && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
+	if (sd_flag & SD_BALANCE_WAKE) {
+		record_wakee(p);
+		want_affine = !wake_wide(p, sibling_count_hint) &&
+			      !wake_cap(p, cpu, prev_cpu) &&
+			      cpumask_test_cpu(cpu, &p->cpus_allowed);
+	}

 	if (energy_aware() && !(cpu_rq(prev_cpu)->rd->overutilized))
 		return select_energy_cpu_brute(p, prev_cpu, sync);

diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index c4ae0f1fdf9b..33d7003fa1b8 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -9,7 +9,8 @@

 #ifdef CONFIG_SMP
 static int
-select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
+select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags,
+		    int sibling_count_hint)
 {
 	return task_cpu(p); /* IDLE tasks as never migrated */
 }

diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 3715473fd8f8..069f8982867f 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1372,7 +1372,8 @@ static void yield_task_rt(struct rq *rq)
 static int find_lowest_rq(struct task_struct *task);

 static int
-select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
+select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags,
+		  int sibling_count_hint)
 {
 	struct task_struct *curr;
 	struct rq *rq;

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index deba080af845..a2d5de8415e1 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1261,7 +1261,8 @@ struct sched_class {
 	void (*put_prev_task) (struct rq *rq, struct task_struct *p);

 #ifdef CONFIG_SMP
-	int  (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
+	int  (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags,
+			       int subling_count_hint);
 	void (*migrate_task_rq)(struct task_struct *p);

 	void (*task_waking) (struct task_struct *task);

diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 61f852d46858..a5567ccd8803 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -12,7 +12,8 @@

 #ifdef CONFIG_SMP
 static int
-select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags)
+select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags,
+		    int sibling_count_hint)
 {
 	return task_cpu(p); /* stop tasks as never migrate */
 }
-- cgit v1.2.3

From e79f447a9762f68d6ecf8371bcf3e970bceb662a Mon Sep 17 00:00:00 2001
From: Vikram Mulukutla
Date: Thu, 10 Aug 2017 17:26:20 -0700
Subject: sched: walt: Correct WALT window size initialization

It is preferable that WALT window rollover occurs just before a tick,
since the tick is an opportune moment to record a complete window's
statistics, as well as report those stats to the cpu frequency
governor. When CONFIG_HZ results in a TICK_NSEC that isn't an integral
number, this requirement may be violated. Account for this by reducing
the WALT window size to the nearest multiple of TICK_NSEC.
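The rounding itself is plain integer arithmetic; a quick stand-alone
illustration for the HZ=300 case (TICK_NSEC is 3333333ns there; the
values are only examples):

	#include <stdio.h>

	int main(void)
	{
		unsigned long long tick_nsec = 3333333ULL; /* HZ=300 */
		unsigned long long window = 20000000ULL;   /* requested 20ms */

		/* round down to a whole number of ticks */
		unsigned long long adj = (window / tick_nsec) * tick_nsec;

		printf("%llu\n", adj); /* 19999998: 6 ticks, just under 20ms */
		return 0;
	}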
Commit d368c6faa19b ("sched: walt: fix window misalignment when HZ=300") attempted to do this but WALT isn't using MIN_SCHED_RAVG_WINDOW as the window size and the patch was doing nothing. Also, change the type of 'walt_disabled' to bool and warn if an invalid window size causes WALT to be disabled. Change-Id: Ie3dcfc21a3df4408254ca1165a355bbe391ed5c7 Signed-off-by: Vikram Mulukutla --- include/trace/events/sched.h | 2 +- kernel/sched/sched.h | 2 +- kernel/sched/walt.c | 46 +++++++++++++++++++++++++++----------------- kernel/sched/walt.h | 2 +- 4 files changed, 31 insertions(+), 21 deletions(-) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index bf96bf05be82..9f7fd0c6662b 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -642,7 +642,7 @@ TRACE_EVENT(sched_contrib_scale_f, extern unsigned int sysctl_sched_use_walt_cpu_util; extern unsigned int sysctl_sched_use_walt_task_util; extern unsigned int walt_ravg_window; -extern unsigned int walt_disabled; +extern bool walt_disabled; #endif /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index a2d5de8415e1..dd86072eaf4e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1560,7 +1560,7 @@ static inline unsigned long capacity_orig_of(int cpu) extern unsigned int sysctl_sched_use_walt_cpu_util; extern unsigned int walt_ravg_window; -extern unsigned int walt_disabled; +extern bool walt_disabled; /* * cpu_util returns the amount of capacity of a CPU that is used by CFS diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c index 93f61486f989..8d25ffbe4fed 100644 --- a/kernel/sched/walt.c +++ b/kernel/sched/walt.c @@ -41,25 +41,17 @@ static __read_mostly unsigned int walt_io_is_busy = 0; unsigned int sysctl_sched_walt_init_task_load_pct = 15; -/* 1 -> use PELT based load stats, 0 -> use window-based load stats */ -unsigned int __read_mostly walt_disabled = 0; +/* true -> use PELT based load stats, false -> use window-based load stats */ +bool __read_mostly walt_disabled = false; -/* Window size (in ns) */ -__read_mostly unsigned int walt_ravg_window = 20000000; - -/* Min window size (in ns) = 10ms */ -#ifdef CONFIG_HZ_300 /* - * Tick interval becomes to 3333333 due to - * rounding error when HZ=300. + * Window size (in ns). Adjust for the tick size so that the window + * rollover occurs just before the tick boundary. 
*/ -#define MIN_SCHED_RAVG_WINDOW (3333333 * 6) -#else -#define MIN_SCHED_RAVG_WINDOW 10000000 -#endif - -/* Max window size (in ns) = 1s */ -#define MAX_SCHED_RAVG_WINDOW 1000000000 +__read_mostly unsigned int walt_ravg_window = + (20000000 / TICK_NSEC) * TICK_NSEC; +#define MIN_SCHED_RAVG_WINDOW ((10000000 / TICK_NSEC) * TICK_NSEC) +#define MAX_SCHED_RAVG_WINDOW ((1000000000 / TICK_NSEC) * TICK_NSEC) static unsigned int sync_cpu; static ktime_t ktime_last; @@ -180,10 +172,28 @@ static int exiting_task(struct task_struct *p) static int __init set_walt_ravg_window(char *str) { + unsigned int adj_window; + bool no_walt = walt_disabled; + get_option(&str, &walt_ravg_window); - walt_disabled = (walt_ravg_window < MIN_SCHED_RAVG_WINDOW || - walt_ravg_window > MAX_SCHED_RAVG_WINDOW); + /* Adjust for CONFIG_HZ */ + adj_window = (walt_ravg_window / TICK_NSEC) * TICK_NSEC; + + /* Warn if we're a bit too far away from the expected window size */ + WARN(adj_window < walt_ravg_window - NSEC_PER_MSEC, + "tick-adjusted window size %u, original was %u\n", adj_window, + walt_ravg_window); + + walt_ravg_window = adj_window; + + walt_disabled = walt_disabled || + (walt_ravg_window < MIN_SCHED_RAVG_WINDOW || + walt_ravg_window > MAX_SCHED_RAVG_WINDOW); + + WARN(!no_walt && walt_disabled, + "invalid window size, disabling WALT\n"); + return 0; } diff --git a/kernel/sched/walt.h b/kernel/sched/walt.h index f56c4da16d0b..de7edac43674 100644 --- a/kernel/sched/walt.h +++ b/kernel/sched/walt.h @@ -59,6 +59,6 @@ static inline u64 walt_ktime_clock(void) { return 0; } #endif /* CONFIG_SCHED_WALT */ -extern unsigned int walt_disabled; +extern bool walt_disabled; #endif -- cgit v1.2.3 From a21299785a502ca4b3592a0f977aa1202b105260 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Thu, 22 Sep 2016 12:25:56 +0100 Subject: sched/core: Warn if ENERGY_AWARE is enabled but data is missing If the EAS energy model is missing or incomplete, i.e. sd_scs is NULL, then sched_group_energy will return -EINVAL on the assumption that it raced with a CPU hotplug event. In that case, energy_diff will return 0 and the energy-aware wake path will silently fail to trigger any migrations. This case can be triggered by disabling CONFIG_SCHED_MC on existing platforms, so that there are no sched_groups with the SD_SHARE_CAP_STATES flag, so that sd_scs is NULL. Add checks so that a warning is printed if EAS is ever enabled while the necessary data is not present. 
Change-Id: Id233a510b5ad8b7fcecac0b1d789e730bbfc7c4a Signed-off-by: Brendan Jackman --- kernel/sched/core.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 18d607f9a417..4f11b84eaf0a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -91,6 +91,8 @@ #include #include "walt.h" +static bool have_sched_energy_data(void); + DEFINE_MUTEX(sched_domains_mutex); DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); @@ -193,6 +195,10 @@ static int sched_feat_set(char *cmp) sysctl_sched_features &= ~(1UL << i); sched_feat_disable(i); } else { + if (i == __SCHED_FEAT_ENERGY_AWARE) + WARN(!have_sched_energy_data(), + "Missing sched energy data\n"); + sysctl_sched_features |= (1UL << i); sched_feat_enable(i); } @@ -6649,6 +6655,19 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight); } +static bool have_sched_energy_data(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + if (!rcu_dereference(per_cpu(sd_scs, cpu)) || + !rcu_dereference(per_cpu(sd_ea, cpu))) + return false; + } + + return true; +} + /* * Check that the per-cpu provided sd energy data is consistent for all cpus * within the mask. @@ -7461,6 +7480,9 @@ static int build_sched_domains(const struct cpumask *cpu_map, } rcu_read_unlock(); + WARN(sched_feat(ENERGY_AWARE) && !have_sched_energy_data(), + "Missing data for energy aware scheduling\n"); + ret = 0; error: __free_domain_allocs(&d, alloc_state, cpu_map); -- cgit v1.2.3 From a899b9085c8d5d581b214c24dc707466e8cb479f Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Fri, 27 Oct 2017 13:23:05 -0700 Subject: sched/core: fix have_sched_energy_data build warning have_sched_energy_data is defined only for CONFIG_SMP, so declare it only with CONFIG_SMP. 
Fixes a warning from the Intel build bot:

tree: https://android.googlesource.com/kernel/msm android-4.4
head: a21299785a502ca4b3592a0f977aa1202b105260
commit: a21299785a502ca4b3592a0f977aa1202b105260 [5/5] sched/core: Warn
if ENERGY_AWARE is enabled but data is missing
config: i386-randconfig-x002-201743 (attached as .config)
compiler: gcc-6 (Debian 6.2.0-3) 6.2.0 20160901
reproduce:
        git checkout a21299785a502ca4b3592a0f977aa1202b105260
        # save the attached .config to linux build tree
        make ARCH=i386

All warnings (new ones prefixed by >>):

>> kernel//sched/core.c:94:13: warning: 'have_sched_energy_data' used
   but never defined
    static bool have_sched_energy_data(void);
                ^~~~~~~~~~~~~~~~~~~~~~

vim +/have_sched_energy_data +94 kernel//sched/core.c

    93
  > 94  static bool have_sched_energy_data(void);
    95

Change-Id: I266b63ece6fb31d2b5b11821a8244e147ba6d3a4
Signed-off-by: Joel Fernandes
---
 kernel/sched/core.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4f11b84eaf0a..4e5298a1977e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -91,7 +91,9 @@
 #include
 #include "walt.h"

+#ifdef CONFIG_SMP
 static bool have_sched_energy_data(void);
+#endif

 DEFINE_MUTEX(sched_domains_mutex);
 DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
@@ -195,10 +197,11 @@ static int sched_feat_set(char *cmp)
 			sysctl_sched_features &= ~(1UL << i);
 			sched_feat_disable(i);
 		} else {
+#ifdef CONFIG_SMP
 			if (i == __SCHED_FEAT_ENERGY_AWARE)
 				WARN(!have_sched_energy_data(),
 				     "Missing sched energy data\n");
-
+#endif
 			sysctl_sched_features |= (1UL << i);
 			sched_feat_enable(i);
 		}
-- cgit v1.2.3

From 8f012745e7f6c847ae0843e4ed171e372748b822 Mon Sep 17 00:00:00 2001
From: Sami Tolvanen
Date: Wed, 25 Oct 2017 09:29:46 -0700
Subject: BACKPORT: arm64: relocatable: suppress R_AARCH64_ABS64 relocations in vmlinux

The linker routines that we rely on to produce a relocatable PIE binary
treat it as a shared ELF object in some ways, i.e., it emits symbol
based R_AARCH64_ABS64 relocations into the final binary since doing so
would be appropriate when linking a shared library that is subject to
symbol preemption. (This means that an executable can override certain
symbols that are exported by a shared library it is linked with, and
that the shared library *must* update all its internal references as
well, and point them to the version provided by the executable.)

Symbol preemption does not occur for OS hosted PIE executables, let
alone for vmlinux, and so we would prefer to get rid of these symbol
based relocations. This would allow us to simplify the relocation
routines, and to strip the .dynsym, .dynstr and .hash sections from the
binary. (Note that these are tiny, and are placed in the .init segment,
but they clutter up the vmlinux binary.)

Note that these R_AARCH64_ABS64 relocations are only emitted for
absolute references to symbols defined in the linker script, all other
relocatable quantities are covered by anonymous R_AARCH64_RELATIVE
relocations that simply list the offsets to all 64-bit values in the
binary that need to be fixed up based on the offset between the link
time and run time addresses.

Fortunately, GNU ld has a -Bsymbolic option, which is intended for
shared libraries to allow them to ignore symbol preemption, and
unconditionally bind all internal symbol references to its own
definitions. So set it for our PIE binary as well, and get rid of the
associated sections and the relocation code that processes them.
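The preemption behaviour that -Bsymbolic switches off can be reproduced
in user space; the file names and build commands below are illustrative
only, not part of the kernel build:

	/* lib.c: a shared library whose lib_answer() calls its own answer() */
	int answer(void)     { return 42; }
	int lib_answer(void) { return answer(); }

	/* main.c: the executable defines its own answer(), preempting the lib's */
	int answer(void) { return 7; }
	int lib_answer(void);
	int main(void)   { return lib_answer(); }

	/*
	 * $ gcc -shared -fPIC lib.c -o libdemo.so
	 * $ gcc main.c ./libdemo.so -o demo && ./demo; echo $?   -> 7
	 *   (the library's internal call was preempted by the executable)
	 * $ gcc -shared -fPIC -Wl,-Bsymbolic lib.c -o libdemo.so
	 * $ ./demo; echo $?                                      -> 42
	 *   (-Bsymbolic bound the reference inside the library)
	 */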
Signed-off-by: Ard Biesheuvel [will: fixed conflict with __dynsym_offset linker script entry] Signed-off-by: Will Deacon Note: This backport only adds -Bsymbolic to LDFLAGS_vmlinux, but doesn't remove R_AARCH64_ABS64 relocation handling, because those changes depend on later refactoring of the code that we don't need in android-4.4. Bug: 66932127 Change-Id: I56f664e02bc8d2fa3e5f496fb041bc3a8e1a4094 (cherry picked from commit 08cc55b2afd97a654f71b3bebf8bb0ec89fdc498) Signed-off-by: Sami Tolvanen --- arch/arm64/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index c3bc092fc128..53c054491254 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -16,7 +16,7 @@ OBJCOPYFLAGS :=-O binary -R .note -R .note.gnu.build-id -R .comment -S GZFLAGS :=-9 ifneq ($(CONFIG_RELOCATABLE),) -LDFLAGS_vmlinux += -pie +LDFLAGS_vmlinux += -pie -Bsymbolic endif KBUILD_DEFCONFIG := defconfig -- cgit v1.2.3