From b178c94efdfd7e7c649277c0f570c5db14aaba4f Mon Sep 17 00:00:00 2001
From: Jan Luebbe
Date: Mon, 28 Aug 2017 17:25:16 +0200
Subject: bus: mbus: fix window size calculation for 4GB windows

commit 2bbbd96357ce76cc45ec722c00f654aa7b189112 upstream.

At least the Armada XP SoC supports 4GB on a single DRAM window. Because
the size register values contain the actual size - 1, the MSB is set in
that case. For example, the SDRAM window's control register's value is
0xffffffe1 for 4GB (bits 31 to 24 contain the size).

The MBUS driver reads back each window's size from registers and
calculates the actual size as (control_reg | ~DDR_SIZE_MASK) + 1, which
overflows for 32 bit values, resulting in other miscalculations further
on (a bad RAM window for the CESA crypto engine calculated by
mvebu_mbus_setup_cpu_target_nooverlap() in my case).

This patch changes the type in 'struct mbus_dram_window' from u32 to
u64, which allows us to keep using the same register calculation code in
most MBUS-using drivers (which calculate ->size - 1 again).

Fixes: fddddb52a6c4 ("bus: introduce an Marvell EBU MBus driver")
Signed-off-by: Jan Luebbe
Signed-off-by: Gregory CLEMENT
Signed-off-by: Greg Kroah-Hartman
---
 include/linux/mbus.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/mbus.h b/include/linux/mbus.h
index 1f7bc630d225..71a5a56b0bba 100644
--- a/include/linux/mbus.h
+++ b/include/linux/mbus.h
@@ -29,8 +29,8 @@ struct mbus_dram_target_info
 	struct mbus_dram_window {
 		u8	cs_index;
 		u8	mbus_attr;
-		u32	base;
-		u32	size;
+		u64	base;
+		u64	size;
 	} cs[4];
 };
 
--
cgit v1.2.3

From 8a004caec12bf241e567e3640401256cc9bc2e45 Mon Sep 17 00:00:00 2001
From: David Howells
Date: Wed, 4 Oct 2017 16:43:25 +0100
Subject: KEYS: Fix race between updating and finding a negative key

commit 363b02dab09b3226f3bd1420dad9c72b79a42a76 upstream.

Consolidate KEY_FLAG_INSTANTIATED, KEY_FLAG_NEGATIVE and the rejection
error into one field such that:

 (1) The instantiation state can be modified/read atomically.

 (2) The error can be accessed atomically with the state.

 (3) The error isn't stored unioned with the payload pointers.

This deals with the problem that the state is spread over three
different objects (two bits and a separate variable) and reading or
updating them atomically isn't practical, given that not only can
uninstantiated keys change into instantiated or rejected keys, but
rejected keys can also turn into instantiated keys - and someone
accessing the key might not be using any locking.

The main side effect of this problem is that what was held in the
payload may change, depending on the state. For instance, you might
observe the key to be in the rejected state. You then read the cached
error, but if the key semaphore wasn't locked, the key might've become
instantiated between the two reads - and you might now have something
in hand that isn't actually an error code.

The state is now KEY_IS_UNINSTANTIATED, KEY_IS_POSITIVE or a negative
error code if the key is negatively instantiated. The
key_is_instantiated() function is replaced with key_is_positive() to
avoid confusion as negative keys are also 'instantiated'.

Additionally, barriering is included:

 (1) Order payload-set before state-set during instantiation.

 (2) Order state-read before payload-read when using the key.

Further separate barriering is necessary if RCU is being used to access
the payload content after reading the payload pointers.
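A minimal sketch of that ordering, for illustration only: the reader side
is the key_read_state() helper added in the hunks below, while the writer
shown here (mark_key_instantiated()) is not part of this header diff and
its body is an assumption based on the description above.

	#include <linux/key.h>
	#include <asm/barrier.h>

	/* Publish the payload before releasing the new state, so a reader
	 * that observes the state via key_read_state()'s smp_load_acquire()
	 * is guaranteed to also observe the payload (or the error). */
	static void mark_key_instantiated(struct key *key, int reject_error)
	{
		smp_store_release(&key->state,
				  (reject_error < 0) ? reject_error : KEY_IS_POSITIVE);
	}
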
Fixes: 146aa8b1453b ("KEYS: Merge the type-specific data with the payload data")
Reported-by: Eric Biggers
Signed-off-by: David Howells
Reviewed-by: Eric Biggers
Signed-off-by: Greg Kroah-Hartman
---
 include/linux/key.h | 49 +++++++++++++++++++++++++++++++------------------
 1 file changed, 31 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/include/linux/key.h b/include/linux/key.h
index dcc115e8dd03..af071ca73079 100644
--- a/include/linux/key.h
+++ b/include/linux/key.h
@@ -126,6 +126,11 @@ static inline bool is_key_possessed(const key_ref_t key_ref)
 	return (unsigned long) key_ref & 1UL;
 }
 
+enum key_state {
+	KEY_IS_UNINSTANTIATED,
+	KEY_IS_POSITIVE,		/* Positively instantiated */
+};
+
 /*****************************************************************************/
 /*
  * authentication token / access credential / keyring
@@ -157,6 +162,7 @@ struct key {
 						 *   - may not match RCU dereferenced payload
 						 *   - payload should contain own length
 						 */
+	short			state;		/* Key state (+) or rejection error (-) */
 
 #ifdef KEY_DEBUGGING
 	unsigned		magic;
@@ -165,19 +171,17 @@ struct key {
 #endif
 
 	unsigned long		flags;		/* status flags (change with bitops) */
-#define KEY_FLAG_INSTANTIATED	0	/* set if key has been instantiated */
-#define KEY_FLAG_DEAD		1	/* set if key type has been deleted */
-#define KEY_FLAG_REVOKED	2	/* set if key had been revoked */
-#define KEY_FLAG_IN_QUOTA	3	/* set if key consumes quota */
-#define KEY_FLAG_USER_CONSTRUCT	4	/* set if key is being constructed in userspace */
-#define KEY_FLAG_NEGATIVE	5	/* set if key is negative */
-#define KEY_FLAG_ROOT_CAN_CLEAR	6	/* set if key can be cleared by root without permission */
-#define KEY_FLAG_INVALIDATED	7	/* set if key has been invalidated */
-#define KEY_FLAG_TRUSTED	8	/* set if key is trusted */
-#define KEY_FLAG_TRUSTED_ONLY	9	/* set if keyring only accepts links to trusted keys */
-#define KEY_FLAG_BUILTIN	10	/* set if key is builtin */
-#define KEY_FLAG_ROOT_CAN_INVAL	11	/* set if key can be invalidated by root without permission */
-#define KEY_FLAG_UID_KEYRING	12	/* set if key is a user or user session keyring */
+#define KEY_FLAG_DEAD		0	/* set if key type has been deleted */
+#define KEY_FLAG_REVOKED	1	/* set if key had been revoked */
+#define KEY_FLAG_IN_QUOTA	2	/* set if key consumes quota */
+#define KEY_FLAG_USER_CONSTRUCT	3	/* set if key is being constructed in userspace */
+#define KEY_FLAG_ROOT_CAN_CLEAR	4	/* set if key can be cleared by root without permission */
+#define KEY_FLAG_INVALIDATED	5	/* set if key has been invalidated */
+#define KEY_FLAG_TRUSTED	6	/* set if key is trusted */
+#define KEY_FLAG_TRUSTED_ONLY	7	/* set if keyring only accepts links to trusted keys */
+#define KEY_FLAG_BUILTIN	8	/* set if key is builtin */
+#define KEY_FLAG_ROOT_CAN_INVAL	9	/* set if key can be invalidated by root without permission */
+#define KEY_FLAG_UID_KEYRING	10	/* set if key is a user or user session keyring */
 
 	/* the key type and key description string
 	 * - the desc is used to match a key against search criteria
@@ -203,7 +207,6 @@ struct key {
 			struct list_head name_link;
 			struct assoc_array keys;
 		};
-		int reject_error;
 	};
 };
 
@@ -319,17 +322,27 @@ extern void key_set_timeout(struct key *, unsigned);
 #define KEY_NEED_SETATTR	0x20	/* Require permission to change attributes */
 #define KEY_NEED_ALL		0x3f	/* All the above permissions */
 
+static inline short key_read_state(const struct key *key)
+{
+	/* Barrier versus mark_key_instantiated(). */
+	return smp_load_acquire(&key->state);
+}
+
 /**
- * key_is_instantiated - Determine if a key has been positively instantiated
+ * key_is_positive - Determine if a key has been positively instantiated
  * @key: The key to check.
  *
  * Return true if the specified key has been positively instantiated, false
  * otherwise.
  */
-static inline bool key_is_instantiated(const struct key *key)
+static inline bool key_is_positive(const struct key *key)
+{
+	return key_read_state(key) == KEY_IS_POSITIVE;
+}
+
+static inline bool key_is_negative(const struct key *key)
 {
-	return test_bit(KEY_FLAG_INSTANTIATED, &key->flags) &&
-		!test_bit(KEY_FLAG_NEGATIVE, &key->flags);
+	return key_read_state(key) < 0;
 }
 
 #define rcu_dereference_key(KEY)					\
--
cgit v1.2.3

From 97cb74f48599ca1ae6c17955882f167a4c3aaad2 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Thu, 16 Jun 2016 13:29:28 +0200
Subject: BACKPORT: sched/fair: Fix PELT integrity for new tasks

Vincent and Yuyang found another few scenarios in which entity tracking
goes wobbly.

The scenarios are basically due to the fact that new tasks are not
immediately attached and thereby differ from the normal situation -- a
task is always attached to a cfs_rq load average (such that it includes
its blocked contribution) and are explicitly detached/attached on
migration to another cfs_rq.

Scenario 1: switch to fair class

  p->sched_class = fair_class;
  if (queued)
    enqueue_task(p);
      ...
        enqueue_entity()
          enqueue_entity_load_avg()
            migrated = !sa->last_update_time (true)
            if (migrated)
              attach_entity_load_avg()
  check_class_changed()
    switched_from() (!fair)
    switched_to()   (fair)
      switched_to_fair()
        attach_entity_load_avg()

If @p is a new task that hasn't been fair before, it will have
!last_update_time and, per the above, end up in
attach_entity_load_avg() _twice_.

Scenario 2: change between cgroups

  sched_move_group(p)
    if (queued)
      dequeue_task()
    task_move_group_fair()
      detach_task_cfs_rq()
        detach_entity_load_avg()
      set_task_rq()
      attach_task_cfs_rq()
        attach_entity_load_avg()
    if (queued)
      enqueue_task();
        ...
          enqueue_entity()
            enqueue_entity_load_avg()
              migrated = !sa->last_update_time (true)
              if (migrated)
                attach_entity_load_avg()

Similar as with scenario 1, if @p is a new task, it will have
!load_update_time and we'll end up in attach_entity_load_avg()
_twice_.

Furthermore, notice how we do a detach_entity_load_avg() on something
that wasn't attached to begin with.

As stated above; the problem is that the new task isn't yet attached to
the load tracking and thereby violates the invariant assumption.

This patch remedies this by ensuring a new task is indeed properly
attached to the load tracking on creation, through
post_init_entity_util_avg().

Of course, this isn't entirely as straightforward as one might think,
since the task is hashed before we call wake_up_new_task() and thus can
be poked at. We avoid this by adding TASK_NEW and teaching
cpu_cgroup_can_attach() to refuse such tasks.

.:: BACKPORT

Complicated by the fact that much of the lines changed by the original
of this commit were then changed by:

  df217913e72e sched/fair: Factorize attach/detach entity

and then

  d31b1a66cbe0 sched/fair: Factorize PELT update

, which have both already been backported here.
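A minimal sketch of the TASK_NEW check described above: the real check
lives in cpu_cgroup_can_attach() in kernel/sched/core.c, which walks the
cgroup taskset and keeps its existing RT-group checks; the standalone
helper below is hypothetical and shown only to illustrate the idea, since
the hunk below merely defines the new state bit.

	#include <linux/sched.h>
	#include <linux/errno.h>

	/* Hypothetical helper: refuse cgroup moves for tasks that are still
	 * TASK_NEW, i.e. not yet attached to their cfs_rq load average,
	 * until wake_up_new_task() has run. */
	static int refuse_new_tasks(struct task_struct *task)
	{
		if (task->state == TASK_NEW)
			return -EINVAL;
		return 0;
	}
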
Reported-by: Yuyang Du
Reported-by: Vincent Guittot
Signed-off-by: Peter Zijlstra (Intel)
Cc: Linus Torvalds
Cc: Mike Galbraith
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar
(cherry picked from commit 7dc603c9028ea5d4354e0e317e8481df99b06d7e)
Change-Id: Ibc59eb52310a62709d49a744bd5a24e8b97c4ae8
Signed-off-by: Brendan Jackman
Signed-off-by: Chris Redpath
---
 include/linux/sched.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index d0d0b1b45418..1872d19d1702 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -222,9 +222,10 @@ extern void proc_sched_set_task(struct task_struct *p);
 #define TASK_WAKING		256
 #define TASK_PARKED		512
 #define TASK_NOLOAD		1024
-#define TASK_STATE_MAX		2048
+#define TASK_NEW		2048
+#define TASK_STATE_MAX		4096
 
-#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPN"
+#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPNn"
 
 extern char ___assert_task_state[1 - 2*!!(
 		sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)];
--
cgit v1.2.3

From 43bd960dfe728284e1059f11d6c686d23887c1c6 Mon Sep 17 00:00:00 2001
From: Joonwoo Park
Date: Fri, 3 Feb 2017 11:15:31 -0800
Subject: sched: WALT: account cumulative window demand

Energy cost estimation has been a long-lasting challenge for WALT
because WALT guides CPU frequency based on the CPU utilization of the
previous window. Consequently it's not possible to know a newly
waking-up task's energy cost until WALT reaches the end of the current
window.

WALT already tracks 'Previous Runnable Sum' (prev_runnable_sum) and
'Cumulative Runnable Average' (cr_avg). They are designed for CPU
frequency guidance and task placement, but unfortunately neither is
suitable for energy cost estimation.

Using prev_runnable_sum for energy cost calculation would make us
account CPU and task energy solely based on activity in the previous
window, so, for example, any task that had no activity in the previous
window would be accounted as a 'zero energy cost' task.

Energy estimation with cr_avg is what energy_diff() relies on at
present. However cr_avg can only represent an instantaneous picture of
the energy cost, so, for example, if a CPU was fully occupied for an
entire WALT window and became idle just before the window boundary, and
a wake-up then occurs, energy_diff() accounts that CPU as a 'zero
energy cost' CPU.

As a result, introduce a new accounting unit, 'Cumulative Window
Demand'. The cumulative window demand tracks the demand of all the
tasks seen in the current window, which is neither instantaneous nor
actual execution time. Because a task's demand represents its estimated
scaled execution time when it runs a full window, the accumulation of
all the demands represents the predicted CPU load at the end of the
window. Thus we can estimate the CPU's frequency at the end of the
current WALT window from the cumulative window demand.

The use of prev_runnable_sum for CPU frequency guidance and of cr_avg
for task placement have not changed; both continue to be used for those
purposes, while this patch adds an additional statistic.
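A minimal sketch of how the new statistic could be maintained, for
illustration only: the cum_window_demand counter, the window_start value
and the helper name below are assumptions, and the hunk below only adds
task_struct::last_sleep_ts (which records when a task last went to
sleep) plus assumes the existing per-task ravg.demand estimate.

	#include <linux/sched.h>
	#include <linux/types.h>

	/* Hypothetical accounting: add a task's estimated full-window demand
	 * to the CPU's predicted load at most once per window -- only if the
	 * task last slept before the current window started. */
	static void add_to_cum_window_demand(u64 *cum_window_demand,
					     u64 window_start,
					     struct task_struct *p)
	{
		if (p->last_sleep_ts < window_start)
			*cum_window_demand += p->ravg.demand;
	}
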
Change-Id: I9908c77ead9973a26dea2b36c001c2baf944d4f5
Signed-off-by: Joonwoo Park
---
 include/linux/sched.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1872d19d1702..62e179f84aef 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1568,6 +1568,7 @@ struct task_struct {
 					 * of this task
 					 */
 	u32 init_load_pct;
+	u64 last_sleep_ts;
 #endif
 
 #ifdef CONFIG_CGROUP_SCHED
--
cgit v1.2.3

From 38ddcff85af052ad99e4f3f0b6e9659b0ca10dcf Mon Sep 17 00:00:00 2001
From: Brendan Jackman
Date: Mon, 7 Aug 2017 15:46:13 +0100
Subject: FROMLIST: sched/fair: Use wake_q length as a hint for wake_wide

(from https://patchwork.kernel.org/patch/9895261/)

This patch adds a parameter to select_task_rq, sibling_count_hint,
allowing the caller, where it has this information, to inform the
sched_class of the number of tasks that are being woken up as part of
the same event.

The wake_q mechanism is one case where this information is available.

select_task_rq_fair can then use the information to detect that it
needs to widen the search space for task placement in order to avoid
overloading the last-level cache domain's CPUs.

                               * * *

The reason I am investigating this change is the following use case on
ARM big.LITTLE (asymmetrical CPU capacity): 1 task per CPU, which all
repeatedly do X amount of work then pthread_barrier_wait (i.e. sleep
until the last task finishes its X and hits the barrier). On
big.LITTLE, the tasks which get a "big" CPU finish faster, and then
those CPUs pull over the tasks that are still running:

     v CPU v           ->time->

                    -------------
   0  (big)         11111  /333
                    -------------
   1  (big)         22222   /444|
                    -------------
   2  (LITTLE)      333333/
                    -------------
   3  (LITTLE)      444444/
                    -------------

Now when task 4 hits the barrier (at |) and wakes the others up, there
are 4 tasks whose prev_cpu is a big CPU and 0 tasks whose prev_cpu is a
LITTLE CPU. want_affine therefore means that we'll only look in CPUs 0
and 1 (sd_llc), so tasks will be unnecessarily coscheduled on the bigs
until the next load balance, something like this:

     v CPU v           ->time->

                    ------------------------
   0  (big)         11111  /333  31313\33333
                    ------------------------
   1  (big)         22222   /444|424\4444444
                    ------------------------
   2  (LITTLE)      333333/          \222222
                    ------------------------
   3  (LITTLE)      444444/            \1111
                    ------------------------
                                 ^^^
                           underutilization

So, I'm trying to get want_affine = 0 for these tasks.

I don't _think_ any incarnation of the wakee_flips mechanism can help
us here because which task is waker and which tasks are wakees
generally changes with each iteration. However pthread_barrier_wait
(or more accurately FUTEX_WAKE) has the nice property that we know
exactly how many tasks are being woken, so we can cheat.

It might be a disadvantage that we "widen" _every_ task that's woken in
an event, while select_idle_sibling would work fine for the first
sd_llc_size - 1 tasks.

IIUC, if wake_affine() behaves correctly this trick wouldn't be
necessary on SMP systems, so it might be best guarded by the presence
of SD_ASYM_CPUCAPACITY?

                               * * *

Final note.. In order to observe "perfect" behaviour for this use case,
I also had to disable the TTWU_QUEUE sched feature.
Suppose during the wakeup above we are working through the work queue
and have placed tasks 3 and 2, and are about to place task 1:

     v CPU v           ->time->

                    --------------
   0  (big)         11111  /333  3
                    --------------
   1  (big)         22222   /444|4
                    --------------
   2  (LITTLE)      333333/      2
                    --------------
   3  (LITTLE)      444444/          <- Task 1 should go here
                    --------------

If TTWU_QUEUE is enabled, we will not yet have enqueued task 2 (having
instead sent a reschedule IPI) or attached its load to CPU 2. So we are
likely to also place task 1 on CPU 2. Disabling TTWU_QUEUE means that
we enqueue task 2 before placing task 1, solving this issue.

TTWU_QUEUE is there to minimise rq lock contention, and I guess that
this contention is less of an issue on big.LITTLE systems since they
have relatively few CPUs, which suggests the trade-off makes sense
here.

Change-Id: I2080302839a263e0841a89efea8589ea53bbda9c
Signed-off-by: Brendan Jackman
Signed-off-by: Chris Redpath
Cc: Ingo Molnar
Cc: Peter Zijlstra
Cc: Josef Bacik
Cc: Joel Fernandes
Cc: Mike Galbraith
Cc: Matt Fleming
---
 include/linux/sched.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 62e179f84aef..60af9d2fa922 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -993,12 +993,13 @@ struct wake_q_node {
 struct wake_q_head {
 	struct wake_q_node *first;
 	struct wake_q_node **lastp;
+	int count;
 };
 
 #define WAKE_Q_TAIL ((struct wake_q_node *) 0x01)
 
 #define WAKE_Q(name)					\
-	struct wake_q_head name = { WAKE_Q_TAIL, &name.first }
+	struct wake_q_head name = { WAKE_Q_TAIL, &name.first, 0 }
 
 extern void wake_q_add(struct wake_q_head *head,
 		       struct task_struct *task);
--
cgit v1.2.3

From e79f447a9762f68d6ecf8371bcf3e970bceb662a Mon Sep 17 00:00:00 2001
From: Vikram Mulukutla
Date: Thu, 10 Aug 2017 17:26:20 -0700
Subject: sched: walt: Correct WALT window size initialization

It is preferable that WALT window rollover occurs just before a tick,
since the tick is an opportune moment to record a complete window's
statistics, as well as report those stats to the cpu frequency
governor. When CONFIG_HZ results in a TICK_NSEC that isn't an integral
number, this requirement may be violated. Account for this by reducing
the WALT window size to the nearest multiple of TICK_NSEC.

Commit d368c6faa19b ("sched: walt: fix window misalignment when
HZ=300") attempted to do this, but WALT isn't using
MIN_SCHED_RAVG_WINDOW as the window size, so the patch was doing
nothing.

Also, change the type of 'walt_disabled' to bool and warn if an invalid
window size causes WALT to be disabled.

Change-Id: Ie3dcfc21a3df4408254ca1165a355bbe391ed5c7
Signed-off-by: Vikram Mulukutla
---
 include/trace/events/sched.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index bf96bf05be82..9f7fd0c6662b 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -642,7 +642,7 @@ TRACE_EVENT(sched_contrib_scale_f,
 extern unsigned int sysctl_sched_use_walt_cpu_util;
 extern unsigned int sysctl_sched_use_walt_task_util;
 extern unsigned int walt_ravg_window;
-extern unsigned int walt_disabled;
+extern bool walt_disabled;
 #endif
 
 /*
--
cgit v1.2.3
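A minimal sketch of the window-size correction described in the last
patch above, for illustration only: the hunk in this patch touches only
the walt_disabled declaration, so the helper below, the 20ms default and
the MIN_WALT_WINDOW bound are assumptions.

	#include <linux/kernel.h>
	#include <linux/jiffies.h>
	#include <linux/bug.h>

	#define MIN_WALT_WINDOW 10000000ULL	/* assumed lower bound: 10ms */

	static bool walt_disabled;
	static unsigned int walt_ravg_window = 20000000;	/* assumed 20ms default */

	static void walt_init_window_size(void)
	{
		/* Round the window down to a whole number of scheduler ticks
		 * so that window rollover lines up with a tick. */
		walt_ravg_window = rounddown(walt_ravg_window, TICK_NSEC);

		if (walt_ravg_window < MIN_WALT_WINDOW) {
			WARN(1, "invalid WALT window size, disabling WALT\n");
			walt_disabled = true;
		}
	}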