From b178c94efdfd7e7c649277c0f570c5db14aaba4f Mon Sep 17 00:00:00 2001
From: Jan Luebbe
Date: Mon, 28 Aug 2017 17:25:16 +0200
Subject: bus: mbus: fix window size calculation for 4GB windows

commit 2bbbd96357ce76cc45ec722c00f654aa7b189112 upstream.

At least the Armada XP SoC supports 4GB on a single DRAM window. Because
the size register values contain the actual size - 1, the MSB is set in
that case. For example, the SDRAM window's control register's value is
0xffffffe1 for 4GB (bits 31 to 24 contain the size).

The MBUS driver reads back each window's size from registers and
calculates the actual size as (control_reg | ~DDR_SIZE_MASK) + 1, which
overflows for 32 bit values, resulting in other miscalculations further
on (a bad RAM window for the CESA crypto engine calculated by
mvebu_mbus_setup_cpu_target_nooverlap() in my case).

This patch changes the type in 'struct mbus_dram_window' from u32 to
u64, which allows us to keep using the same register calculation code in
most MBUS-using drivers (which calculate ->size - 1 again).

Fixes: fddddb52a6c4 ("bus: introduce an Marvell EBU MBus driver")
Signed-off-by: Jan Luebbe
Signed-off-by: Gregory CLEMENT
Signed-off-by: Greg Kroah-Hartman
---
 include/linux/mbus.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/mbus.h b/include/linux/mbus.h
index 1f7bc630d225..71a5a56b0bba 100644
--- a/include/linux/mbus.h
+++ b/include/linux/mbus.h
@@ -29,8 +29,8 @@ struct mbus_dram_target_info
 	struct mbus_dram_window {
 		u8	cs_index;
 		u8	mbus_attr;
-		u32	base;
-		u32	size;
+		u64	base;
+		u64	size;
 	} cs[4];
 };
 
--
cgit v1.2.3

From 8a004caec12bf241e567e3640401256cc9bc2e45 Mon Sep 17 00:00:00 2001
From: David Howells
Date: Wed, 4 Oct 2017 16:43:25 +0100
Subject: KEYS: Fix race between updating and finding a negative key

commit 363b02dab09b3226f3bd1420dad9c72b79a42a76 upstream.

Consolidate KEY_FLAG_INSTANTIATED, KEY_FLAG_NEGATIVE and the rejection
error into one field such that:

 (1) The instantiation state can be modified/read atomically.

 (2) The error can be accessed atomically with the state.

 (3) The error isn't stored unioned with the payload pointers.

This deals with the problem that the state is spread over three
different objects (two bits and a separate variable) and reading or
updating them atomically isn't practical, given that not only can
uninstantiated keys change into instantiated or rejected keys, but
rejected keys can also turn into instantiated keys - and someone
accessing the key might not be using any locking.

The main side effect of this problem is that what was held in the
payload may change, depending on the state. For instance, you might
observe the key to be in the rejected state. You then read the cached
error, but if the key semaphore wasn't locked, the key might've become
instantiated between the two reads - and you might now have something
in hand that isn't actually an error code.

The state is now KEY_IS_UNINSTANTIATED, KEY_IS_POSITIVE or a negative
error code if the key is negatively instantiated. The
key_is_instantiated() function is replaced with key_is_positive() to
avoid confusion as negative keys are also 'instantiated'.

Additionally, barriering is included:

 (1) Order payload-set before state-set during instantiation.

 (2) Order state-read before payload-read when using the key.

Further separate barriering is necessary if RCU is being used to access
the payload content after reading the payload pointers.
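A minimal sketch of that ordering, for illustration only: the reader side
is the key_read_state() helper added in the hunks below, while the writer
shown here (mark_key_instantiated()) is not part of this header diff and
its body is an assumption based on the description above.

	#include <linux/key.h>
	#include <asm/barrier.h>

	/* Publish the payload before releasing the new state, so a reader
	 * that observes the state via key_read_state()'s smp_load_acquire()
	 * is guaranteed to also observe the payload (or the error). */
	static void mark_key_instantiated(struct key *key, int reject_error)
	{
		smp_store_release(&key->state,
				  (reject_error < 0) ? reject_error : KEY_IS_POSITIVE);
	}
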
Fixes: 146aa8b1453b ("KEYS: Merge the type-specific data with the payload data")
Reported-by: Eric Biggers
Signed-off-by: David Howells
Reviewed-by: Eric Biggers
Signed-off-by: Greg Kroah-Hartman
---
 include/linux/key.h | 49 +++++++++++++++++++++++++++++++------------------
 1 file changed, 31 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/include/linux/key.h b/include/linux/key.h
index dcc115e8dd03..af071ca73079 100644
--- a/include/linux/key.h
+++ b/include/linux/key.h
@@ -126,6 +126,11 @@ static inline bool is_key_possessed(const key_ref_t key_ref)
 	return (unsigned long) key_ref & 1UL;
 }
 
+enum key_state {
+	KEY_IS_UNINSTANTIATED,
+	KEY_IS_POSITIVE,		/* Positively instantiated */
+};
+
 /*****************************************************************************/
 /*
  * authentication token / access credential / keyring
@@ -157,6 +162,7 @@ struct key {
 						 *   - may not match RCU dereferenced payload
 						 *   - payload should contain own length
 						 */
+	short			state;		/* Key state (+) or rejection error (-) */
 
 #ifdef KEY_DEBUGGING
 	unsigned		magic;
@@ -165,19 +171,17 @@ struct key {
 #endif
 
 	unsigned long		flags;		/* status flags (change with bitops) */
-#define KEY_FLAG_INSTANTIATED	0	/* set if key has been instantiated */
-#define KEY_FLAG_DEAD		1	/* set if key type has been deleted */
-#define KEY_FLAG_REVOKED	2	/* set if key had been revoked */
-#define KEY_FLAG_IN_QUOTA	3	/* set if key consumes quota */
-#define KEY_FLAG_USER_CONSTRUCT	4	/* set if key is being constructed in userspace */
-#define KEY_FLAG_NEGATIVE	5	/* set if key is negative */
-#define KEY_FLAG_ROOT_CAN_CLEAR	6	/* set if key can be cleared by root without permission */
-#define KEY_FLAG_INVALIDATED	7	/* set if key has been invalidated */
-#define KEY_FLAG_TRUSTED	8	/* set if key is trusted */
-#define KEY_FLAG_TRUSTED_ONLY	9	/* set if keyring only accepts links to trusted keys */
-#define KEY_FLAG_BUILTIN	10	/* set if key is builtin */
-#define KEY_FLAG_ROOT_CAN_INVAL	11	/* set if key can be invalidated by root without permission */
-#define KEY_FLAG_UID_KEYRING	12	/* set if key is a user or user session keyring */
+#define KEY_FLAG_DEAD		0	/* set if key type has been deleted */
+#define KEY_FLAG_REVOKED	1	/* set if key had been revoked */
+#define KEY_FLAG_IN_QUOTA	2	/* set if key consumes quota */
+#define KEY_FLAG_USER_CONSTRUCT	3	/* set if key is being constructed in userspace */
+#define KEY_FLAG_ROOT_CAN_CLEAR	4	/* set if key can be cleared by root without permission */
+#define KEY_FLAG_INVALIDATED	5	/* set if key has been invalidated */
+#define KEY_FLAG_TRUSTED	6	/* set if key is trusted */
+#define KEY_FLAG_TRUSTED_ONLY	7	/* set if keyring only accepts links to trusted keys */
+#define KEY_FLAG_BUILTIN	8	/* set if key is builtin */
+#define KEY_FLAG_ROOT_CAN_INVAL	9	/* set if key can be invalidated by root without permission */
+#define KEY_FLAG_UID_KEYRING	10	/* set if key is a user or user session keyring */
 
 	/* the key type and key description string
 	 * - the desc is used to match a key against search criteria
@@ -203,7 +207,6 @@ struct key {
 			struct list_head name_link;
 			struct assoc_array keys;
 		};
-		int reject_error;
 	};
 };
 
@@ -319,17 +322,27 @@ extern void key_set_timeout(struct key *, unsigned);
 #define KEY_NEED_SETATTR	0x20	/* Require permission to change attributes */
 #define KEY_NEED_ALL		0x3f	/* All the above permissions */
 
+static inline short key_read_state(const struct key *key)
+{
+	/* Barrier versus mark_key_instantiated(). */
+	return smp_load_acquire(&key->state);
+}
+
 /**
- * key_is_instantiated - Determine if a key has been positively instantiated
+ * key_is_positive - Determine if a key has been positively instantiated
  * @key: The key to check.
  *
  * Return true if the specified key has been positively instantiated, false
  * otherwise.
  */
-static inline bool key_is_instantiated(const struct key *key)
+static inline bool key_is_positive(const struct key *key)
+{
+	return key_read_state(key) == KEY_IS_POSITIVE;
+}
+
+static inline bool key_is_negative(const struct key *key)
 {
-	return test_bit(KEY_FLAG_INSTANTIATED, &key->flags) &&
-		!test_bit(KEY_FLAG_NEGATIVE, &key->flags);
+	return key_read_state(key) < 0;
 }
 
 #define rcu_dereference_key(KEY)					\
--
cgit v1.2.3

From 97cb74f48599ca1ae6c17955882f167a4c3aaad2 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Thu, 16 Jun 2016 13:29:28 +0200
Subject: BACKPORT: sched/fair: Fix PELT integrity for new tasks

Vincent and Yuyang found another few scenarios in which entity tracking
goes wobbly.

The scenarios are basically due to the fact that new tasks are not
immediately attached and thereby differ from the normal situation -- a
task is always attached to a cfs_rq load average (such that it includes
its blocked contribution) and are explicitly detached/attached on
migration to another cfs_rq.

Scenario 1: switch to fair class

  p->sched_class = fair_class;
  if (queued)
    enqueue_task(p);
      ...
        enqueue_entity()
          enqueue_entity_load_avg()
            migrated = !sa->last_update_time (true)
            if (migrated)
              attach_entity_load_avg()
  check_class_changed()
    switched_from() (!fair)
    switched_to()   (fair)
      switched_to_fair()
        attach_entity_load_avg()

If @p is a new task that hasn't been fair before, it will have
!last_update_time and, per the above, end up in
attach_entity_load_avg() _twice_.

Scenario 2: change between cgroups

  sched_move_group(p)
    if (queued)
      dequeue_task()
    task_move_group_fair()
      detach_task_cfs_rq()
        detach_entity_load_avg()
      set_task_rq()
      attach_task_cfs_rq()
        attach_entity_load_avg()
    if (queued)
      enqueue_task();
        ...
          enqueue_entity()
            enqueue_entity_load_avg()
              migrated = !sa->last_update_time (true)
              if (migrated)
                attach_entity_load_avg()

Similar as with scenario 1, if @p is a new task, it will have
!load_update_time and we'll end up in attach_entity_load_avg()
_twice_.

Furthermore, notice how we do a detach_entity_load_avg() on something
that wasn't attached to begin with.

As stated above; the problem is that the new task isn't yet attached to
the load tracking and thereby violates the invariant assumption.

This patch remedies this by ensuring a new task is indeed properly
attached to the load tracking on creation, through
post_init_entity_util_avg().

Of course, this isn't entirely as straightforward as one might think,
since the task is hashed before we call wake_up_new_task() and thus can
be poked at. We avoid this by adding TASK_NEW and teaching
cpu_cgroup_can_attach() to refuse such tasks.

.:: BACKPORT

Complicated by the fact that much of the lines changed by the original
of this commit were then changed by:

  df217913e72e sched/fair: Factorize attach/detach entity

and then

  d31b1a66cbe0 sched/fair: Factorize PELT update

, which have both already been backported here.
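A minimal sketch of the TASK_NEW check described above: the real check
lives in cpu_cgroup_can_attach() in kernel/sched/core.c, which walks the
cgroup taskset and keeps its existing RT-group checks; the standalone
helper below is hypothetical and shown only to illustrate the idea, since
the hunk below merely defines the new state bit.

	#include <linux/sched.h>
	#include <linux/errno.h>

	/* Hypothetical helper: refuse cgroup moves for tasks that are still
	 * TASK_NEW, i.e. not yet attached to their cfs_rq load average,
	 * until wake_up_new_task() has run. */
	static int refuse_new_tasks(struct task_struct *task)
	{
		if (task->state == TASK_NEW)
			return -EINVAL;
		return 0;
	}
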
Reported-by: Yuyang Du
Reported-by: Vincent Guittot
Signed-off-by: Peter Zijlstra (Intel)
Cc: Linus Torvalds
Cc: Mike Galbraith
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar
(cherry picked from commit 7dc603c9028ea5d4354e0e317e8481df99b06d7e)
Change-Id: Ibc59eb52310a62709d49a744bd5a24e8b97c4ae8
Signed-off-by: Brendan Jackman
Signed-off-by: Chris Redpath
---
 include/linux/sched.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index d0d0b1b45418..1872d19d1702 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -222,9 +222,10 @@ extern void proc_sched_set_task(struct task_struct *p);
 #define TASK_WAKING		256
 #define TASK_PARKED		512
 #define TASK_NOLOAD		1024
-#define TASK_STATE_MAX		2048
+#define TASK_NEW		2048
+#define TASK_STATE_MAX		4096
 
-#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPN"
+#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPNn"
 
 extern char ___assert_task_state[1 - 2*!!(
 		sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)];
--
cgit v1.2.3

From 43bd960dfe728284e1059f11d6c686d23887c1c6 Mon Sep 17 00:00:00 2001
From: Joonwoo Park
Date: Fri, 3 Feb 2017 11:15:31 -0800
Subject: sched: WALT: account cumulative window demand

Energy cost estimation has been a long-lasting challenge for WALT
because WALT guides CPU frequency based on the CPU utilization of the
previous window. Consequently it's not possible to know a newly
waking-up task's energy cost until WALT reaches the end of the current
window.

WALT already tracks 'Previous Runnable Sum' (prev_runnable_sum) and
'Cumulative Runnable Average' (cr_avg). They are designed for CPU
frequency guidance and task placement, but unfortunately neither is
suitable for energy cost estimation.

Using prev_runnable_sum for energy cost calculation would make us
account CPU and task energy solely based on activity in the previous
window, so, for example, any task that had no activity in the previous
window would be accounted as a 'zero energy cost' task.

Energy estimation with cr_avg is what energy_diff() relies on at
present. However cr_avg can only represent an instantaneous picture of
the energy cost, so, for example, if a CPU was fully occupied for an
entire WALT window and became idle just before the window boundary, and
a wake-up then occurs, energy_diff() accounts that CPU as a 'zero
energy cost' CPU.

As a result, introduce a new accounting unit, 'Cumulative Window
Demand'. The cumulative window demand tracks the demand of all the
tasks seen in the current window, which is neither instantaneous nor
actual execution time. Because a task's demand represents its estimated
scaled execution time when it runs a full window, the accumulation of
all the demands represents the predicted CPU load at the end of the
window. Thus we can estimate the CPU's frequency at the end of the
current WALT window from the cumulative window demand.

The use of prev_runnable_sum for CPU frequency guidance and of cr_avg
for task placement have not changed; both continue to be used for those
purposes, while this patch adds an additional statistic.
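A minimal sketch of how the new statistic could be maintained, for
illustration only: the cum_window_demand counter, the window_start value
and the helper name below are assumptions, and the hunk below only adds
task_struct::last_sleep_ts (which records when a task last went to
sleep) plus assumes the existing per-task ravg.demand estimate.

	#include <linux/sched.h>
	#include <linux/types.h>

	/* Hypothetical accounting: add a task's estimated full-window demand
	 * to the CPU's predicted load at most once per window -- only if the
	 * task last slept before the current window started. */
	static void add_to_cum_window_demand(u64 *cum_window_demand,
					     u64 window_start,
					     struct task_struct *p)
	{
		if (p->last_sleep_ts < window_start)
			*cum_window_demand += p->ravg.demand;
	}
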
Change-Id: I9908c77ead9973a26dea2b36c001c2baf944d4f5
Signed-off-by: Joonwoo Park
---
 include/linux/sched.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1872d19d1702..62e179f84aef 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1568,6 +1568,7 @@ struct task_struct {
 					 * of this task
 					 */
 	u32 init_load_pct;
+	u64 last_sleep_ts;
 #endif
 
 #ifdef CONFIG_CGROUP_SCHED
--
cgit v1.2.3

From 38ddcff85af052ad99e4f3f0b6e9659b0ca10dcf Mon Sep 17 00:00:00 2001
From: Brendan Jackman
Date: Mon, 7 Aug 2017 15:46:13 +0100
Subject: FROMLIST: sched/fair: Use wake_q length as a hint for wake_wide

(from https://patchwork.kernel.org/patch/9895261/)

This patch adds a parameter to select_task_rq, sibling_count_hint,
allowing the caller, where it has this information, to inform the
sched_class of the number of tasks that are being woken up as part of
the same event.

The wake_q mechanism is one case where this information is available.

select_task_rq_fair can then use the information to detect that it
needs to widen the search space for task placement in order to avoid
overloading the last-level cache domain's CPUs.

                               * * *

The reason I am investigating this change is the following use case on
ARM big.LITTLE (asymmetrical CPU capacity): 1 task per CPU, which all
repeatedly do X amount of work then pthread_barrier_wait (i.e. sleep
until the last task finishes its X and hits the barrier). On
big.LITTLE, the tasks which get a "big" CPU finish faster, and then
those CPUs pull over the tasks that are still running:

     v CPU v           ->time->

                    -------------
   0  (big)         11111  /333
                    -------------
   1  (big)         22222   /444|
                    -------------
   2  (LITTLE)      333333/
                    -------------
   3  (LITTLE)      444444/
                    -------------

Now when task 4 hits the barrier (at |) and wakes the others up, there
are 4 tasks whose prev_cpu is a big CPU and 0 tasks whose prev_cpu is a
LITTLE CPU. want_affine therefore means that we'll only look in CPUs 0
and 1 (sd_llc), so tasks will be unnecessarily coscheduled on the bigs
until the next load balance, something like this:

     v CPU v           ->time->

                    ------------------------
   0  (big)         11111  /333  31313\33333
                    ------------------------
   1  (big)         22222   /444|424\4444444
                    ------------------------
   2  (LITTLE)      333333/          \222222
                    ------------------------
   3  (LITTLE)      444444/            \1111
                    ------------------------
                                 ^^^
                           underutilization

So, I'm trying to get want_affine = 0 for these tasks.

I don't _think_ any incarnation of the wakee_flips mechanism can help
us here because which task is waker and which tasks are wakees
generally changes with each iteration. However pthread_barrier_wait
(or more accurately FUTEX_WAKE) has the nice property that we know
exactly how many tasks are being woken, so we can cheat.

It might be a disadvantage that we "widen" _every_ task that's woken in
an event, while select_idle_sibling would work fine for the first
sd_llc_size - 1 tasks.

IIUC, if wake_affine() behaves correctly this trick wouldn't be
necessary on SMP systems, so it might be best guarded by the presence
of SD_ASYM_CPUCAPACITY?

                               * * *

Final note.. In order to observe "perfect" behaviour for this use case,
I also had to disable the TTWU_QUEUE sched feature.
Suppose during the wakeup above we are working through the work queue
and have placed tasks 3 and 2, and are about to place task 1:

     v CPU v           ->time->

                    --------------
   0  (big)         11111  /333  3
                    --------------
   1  (big)         22222   /444|4
                    --------------
   2  (LITTLE)      333333/      2
                    --------------
   3  (LITTLE)      444444/          <- Task 1 should go here
                    --------------

If TTWU_QUEUE is enabled, we will not yet have enqueued task 2 (having
instead sent a reschedule IPI) or attached its load to CPU 2. So we are
likely to also place task 1 on CPU 2. Disabling TTWU_QUEUE means that
we enqueue task 2 before placing task 1, solving this issue.

TTWU_QUEUE is there to minimise rq lock contention, and I guess that
this contention is less of an issue on big.LITTLE systems since they
have relatively few CPUs, which suggests the trade-off makes sense
here.

Change-Id: I2080302839a263e0841a89efea8589ea53bbda9c
Signed-off-by: Brendan Jackman
Signed-off-by: Chris Redpath
Cc: Ingo Molnar
Cc: Peter Zijlstra
Cc: Josef Bacik
Cc: Joel Fernandes
Cc: Mike Galbraith
Cc: Matt Fleming
---
 include/linux/sched.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 62e179f84aef..60af9d2fa922 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -993,12 +993,13 @@ struct wake_q_node {
 struct wake_q_head {
 	struct wake_q_node *first;
 	struct wake_q_node **lastp;
+	int count;
 };
 
 #define WAKE_Q_TAIL ((struct wake_q_node *) 0x01)
 
 #define WAKE_Q(name)					\
-	struct wake_q_head name = { WAKE_Q_TAIL, &name.first }
+	struct wake_q_head name = { WAKE_Q_TAIL, &name.first, 0 }
 
 extern void wake_q_add(struct wake_q_head *head,
 		       struct task_struct *task);
--
cgit v1.2.3

From e79f447a9762f68d6ecf8371bcf3e970bceb662a Mon Sep 17 00:00:00 2001
From: Vikram Mulukutla
Date: Thu, 10 Aug 2017 17:26:20 -0700
Subject: sched: walt: Correct WALT window size initialization

It is preferable that WALT window rollover occurs just before a tick,
since the tick is an opportune moment to record a complete window's
statistics, as well as report those stats to the cpu frequency
governor. When CONFIG_HZ results in a TICK_NSEC that isn't an integral
number, this requirement may be violated. Account for this by reducing
the WALT window size to the nearest multiple of TICK_NSEC.

Commit d368c6faa19b ("sched: walt: fix window misalignment when
HZ=300") attempted to do this, but WALT isn't using
MIN_SCHED_RAVG_WINDOW as the window size, so the patch was doing
nothing.

Also, change the type of 'walt_disabled' to bool and warn if an invalid
window size causes WALT to be disabled.

Change-Id: Ie3dcfc21a3df4408254ca1165a355bbe391ed5c7
Signed-off-by: Vikram Mulukutla
---
 include/trace/events/sched.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index bf96bf05be82..9f7fd0c6662b 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -642,7 +642,7 @@ TRACE_EVENT(sched_contrib_scale_f,
 extern unsigned int sysctl_sched_use_walt_cpu_util;
 extern unsigned int sysctl_sched_use_walt_task_util;
 extern unsigned int walt_ravg_window;
-extern unsigned int walt_disabled;
+extern bool walt_disabled;
 #endif
 
 /*
--
cgit v1.2.3
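A minimal sketch of the window-size correction described in the last
patch above, for illustration only: the hunk in this patch touches only
the walt_disabled declaration, so the helper below, the 20ms default and
the MIN_WALT_WINDOW bound are assumptions.

	#include <linux/kernel.h>
	#include <linux/jiffies.h>
	#include <linux/bug.h>

	#define MIN_WALT_WINDOW 10000000ULL	/* assumed lower bound: 10ms */

	static bool walt_disabled;
	static unsigned int walt_ravg_window = 20000000;	/* assumed 20ms default */

	static void walt_init_window_size(void)
	{
		/* Round the window down to a whole number of scheduler ticks
		 * so that window rollover lines up with a tick. */
		walt_ravg_window = rounddown(walt_ravg_window, TICK_NSEC);

		if (walt_ravg_window < MIN_WALT_WINDOW) {
			WARN(1, "invalid WALT window size, disabling WALT\n");
			walt_disabled = true;
		}
	}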