From 0d7a6c301af8851542a9ec66a7dab571a979c057 Mon Sep 17 00:00:00 2001 From: Nagalakshmi Date: Tue, 29 Nov 2022 22:48:35 -0800 Subject: qcacld-3.0: Fix OOB in wma_scan_roam.c Currently, in the wma_extscan_hotlist_match_event_handler API, dest_hotlist gets its memory allocation based on numap, which takes its value from event->total_entries. But numap is limited to WMA_EXTSCAN_MAX_HOTLIST_ENTRIES, and an event->total_entries larger than WMA_EXTSCAN_MAX_HOTLIST_ENTRIES can cause an out-of-bounds issue. The fix is to populate dest_hotlist->numOfAps from numap instead of event->total_entries to avoid any out-of-bounds access. Change-Id: I756f7e4a4dcd454508bba83d4a8bbbb139530905 CRs-Fixed: 3346781 --- drivers/staging/qcacld-3.0/core/wma/src/wma_scan_roam.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/staging/qcacld-3.0/core/wma/src/wma_scan_roam.c b/drivers/staging/qcacld-3.0/core/wma/src/wma_scan_roam.c index f6b3c34c274e..99795c3d1785 100644 --- a/drivers/staging/qcacld-3.0/core/wma/src/wma_scan_roam.c +++ b/drivers/staging/qcacld-3.0/core/wma/src/wma_scan_roam.c @@ -5140,7 +5140,7 @@ int wma_extscan_hotlist_match_event_handler(void *handle, return -ENOMEM; } dest_ap = &dest_hotlist->ap[0]; - dest_hotlist->numOfAps = event->total_entries; + dest_hotlist->numOfAps = numap; dest_hotlist->requestId = event->config_request_id; if (event->first_entry_index + -- cgit v1.2.3 From d6038d6da57f766f4c9bb946a107e48617b414ff Mon Sep 17 00:00:00 2001 From: Soumya Managoli Date: Fri, 6 Jan 2023 14:37:20 +0530 Subject: ASoC: msm-pcm-q6-v2: Add dsp buf check The current logic copies a user-buf-sized amount of data from the available dsp buf at a given offset. If this offset, returned from the DSP in the READ_DONE event, goes out of bounds or is corrupted, it can lead to out-of-bounds DSP buffer access, resulting in a memory fault. The fix is to add a check that this buf offset is within the buf size range. Change-Id: I7753cc6db394704dbb959477150141d42b836bef Signed-off-by: Soumya Managoli --- sound/soc/msm/qdsp6v2/msm-pcm-q6-v2.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/sound/soc/msm/qdsp6v2/msm-pcm-q6-v2.c b/sound/soc/msm/qdsp6v2/msm-pcm-q6-v2.c index 487aaf2390c0..5f4225e675ad 100644 --- a/sound/soc/msm/qdsp6v2/msm-pcm-q6-v2.c +++ b/sound/soc/msm/qdsp6v2/msm-pcm-q6-v2.c @@ -1,4 +1,5 @@ /* Copyright (c) 2012-2020, The Linux Foundation. All rights reserved. + * Copyright (c) 2023 Qualcomm Innovation Center, Inc. All rights reserved.
* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 and @@ -995,6 +996,14 @@ static int msm_pcm_capture_copy(struct snd_pcm_substream *substream, xfer = size; offset = prtd->in_frame_info[idx].offset; pr_debug("Offset value = %d\n", offset); + + if (offset >= size) { + pr_err("%s: Invalid dsp buf offset\n", __func__); + ret = -EFAULT; + q6asm_cpu_buf_release(OUT, prtd->audio_client); + goto fail; + } + if (copy_to_user(buf, bufptr+offset, xfer)) { pr_err("Failed to copy buf to user\n"); ret = -EFAULT; -- cgit v1.2.3 From 094b738f46c80c56d03d923c3e780e071abe34b0 Mon Sep 17 00:00:00 2001 From: Michael Bestas Date: Tue, 23 May 2023 18:44:59 +0300 Subject: power: qpnp-smb2: Implement battery charging_enabled node Change-Id: Id08c169f0c507390eab070d1ae77bfb992b50b81 --- drivers/power/supply/qcom/qpnp-smb2.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/power/supply/qcom/qpnp-smb2.c b/drivers/power/supply/qcom/qpnp-smb2.c index 5fae7b99d88f..b0704af49353 100644 --- a/drivers/power/supply/qcom/qpnp-smb2.c +++ b/drivers/power/supply/qcom/qpnp-smb2.c @@ -917,6 +917,7 @@ static int smb2_init_dc_psy(struct smb2 *chip) *************************/ static enum power_supply_property smb2_batt_props[] = { + POWER_SUPPLY_PROP_CHARGING_ENABLED, POWER_SUPPLY_PROP_INPUT_SUSPEND, POWER_SUPPLY_PROP_STATUS, POWER_SUPPLY_PROP_HEALTH, @@ -967,6 +968,9 @@ static int smb2_batt_get_prop(struct power_supply *psy, case POWER_SUPPLY_PROP_PRESENT: rc = smblib_get_prop_batt_present(chg, val); break; + case POWER_SUPPLY_PROP_CHARGING_ENABLED: + val->intval = !get_effective_result(chg->chg_disable_votable); + break; case POWER_SUPPLY_PROP_INPUT_SUSPEND: rc = smblib_get_prop_input_suspend(chg, val); break; @@ -1079,6 +1083,9 @@ static int smb2_batt_set_prop(struct power_supply *psy, struct smb_charger *chg = power_supply_get_drvdata(psy); switch (prop) { + case POWER_SUPPLY_PROP_CHARGING_ENABLED: + vote(chg->chg_disable_votable, USER_VOTER, !!!val->intval, 0); + break; case POWER_SUPPLY_PROP_INPUT_SUSPEND: rc = smblib_set_prop_input_suspend(chg, val); break; @@ -1163,6 +1170,7 @@ static int smb2_batt_prop_is_writeable(struct power_supply *psy, enum power_supply_property psp) { switch (psp) { + case POWER_SUPPLY_PROP_CHARGING_ENABLED: case POWER_SUPPLY_PROP_INPUT_SUSPEND: case POWER_SUPPLY_PROP_SYSTEM_TEMP_LEVEL: case POWER_SUPPLY_PROP_CAPACITY: -- cgit v1.2.3 From 07f7c9961c7cd0090dd1771f61245746af7fe1ea Mon Sep 17 00:00:00 2001 From: Umang Agrawal Date: Wed, 20 Jun 2018 14:48:05 +0530 Subject: power: smb-lib: Fix mutex acquisition deadlock on PD hard reset Mutex acquisition deadlock can happen while cancelling cc_dettach work during pd_hard_reset from the function usbin_plugin_hard_reset _locked on vbus rise which is called in the same lock context that we try to acquire in the cc_dettach work routine. Check if cc_dettach work is running during pd_hard_reset and use trylock instead of mutex_lock to prevent any deadlock if mutext is already held. 
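A rough userspace analogue of the conditional-trylock pattern described above; the flag, the function name and the pthread API are stand-ins for illustration, not the driver's actual code:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static bool hard_reset_in_progress;	/* set by the path that may already hold 'lock' */

/* Work routine: take the lock normally, but fall back to trylock when the
 * canceller might already hold it, so the two paths cannot deadlock. */
static void detach_work(void)
{
	bool locked = false;

	if (hard_reset_in_progress) {
		if (pthread_mutex_trylock(&lock) == 0)
			locked = true;
	} else {
		pthread_mutex_lock(&lock);
		locked = true;
	}

	printf("handled typec change, locked=%d\n", locked);

	if (locked)
		pthread_mutex_unlock(&lock);
}

int main(void)
{
	detach_work();
	return 0;
}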
Change-Id: I5530deb9e654d3d12ba1b4bc6876f36127a0d5a5 Signed-off-by: Umang Agrawal --- drivers/power/supply/qcom/smb-lib.c | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/drivers/power/supply/qcom/smb-lib.c b/drivers/power/supply/qcom/smb-lib.c index 81623c65ea8e..1782f23fafa7 100644 --- a/drivers/power/supply/qcom/smb-lib.c +++ b/drivers/power/supply/qcom/smb-lib.c @@ -4457,6 +4457,7 @@ static void rdstd_cc2_detach_work(struct work_struct *work) { int rc; u8 stat4, stat5; + bool lock = false; struct smb_charger *chg = container_of(work, struct smb_charger, rdstd_cc2_detach_work); @@ -4519,9 +4520,28 @@ static void rdstd_cc2_detach_work(struct work_struct *work) rc = smblib_masked_write(chg, TYPE_C_INTRPT_ENB_SOFTWARE_CTRL_REG, EXIT_SNK_BASED_ON_CC_BIT, 0); smblib_reg_block_restore(chg, cc2_detach_settings); - mutex_lock(&chg->lock); + + /* + * Mutex acquisition deadlock can happen while cancelling this work + * during pd_hard_reset from the function smblib_cc2_sink_removal_exit + * which is called in the same lock context that we try to acquire in + * this work routine. + * Check if this work is running during pd_hard_reset and use trylock + * instead of mutex_lock to prevent any deadlock if mutext is already + * held. + */ + if (chg->pd_hard_reset) { + if (mutex_trylock(&chg->lock)) + lock = true; + } else { + mutex_lock(&chg->lock); + lock = true; + } + smblib_usb_typec_change(chg); - mutex_unlock(&chg->lock); + + if (lock) + mutex_unlock(&chg->lock); return; rerun: -- cgit v1.2.3 From 1daa7ea39076e334a07ffb90f55ae33398b3477f Mon Sep 17 00:00:00 2001 From: Archana Sathyakumar Date: Wed, 26 Jul 2017 07:37:51 -0600 Subject: pinctrl: qcom: Update irq handle for GPIO pins Default handle_irq for tlmm irq chip is handle_edge_irq. For direct connect GPIOs, the handle_irq is not changed unlike non-direct connect GPIOs. This causes an interrupt storm for level trigger types as handle_edge_irq does not mask the interrupt within the function. Change this to handle_fasteoi_irq such that both level and edge interrupts are handled correctly. Change-Id: I79f0d4d92145f85a8043875301400ecf36b46c7b Signed-off-by: Archana Sathyakumar --- drivers/pinctrl/qcom/pinctrl-msm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pinctrl/qcom/pinctrl-msm.c b/drivers/pinctrl/qcom/pinctrl-msm.c index 22496ad167a0..d4a1f5378ac5 100644 --- a/drivers/pinctrl/qcom/pinctrl-msm.c +++ b/drivers/pinctrl/qcom/pinctrl-msm.c @@ -905,7 +905,7 @@ static int msm_gpio_init(struct msm_pinctrl *pctrl) ret = gpiochip_irqchip_add(chip, &msm_gpio_irq_chip, 0, - handle_edge_irq, + handle_fasteoi_irq, IRQ_TYPE_NONE); if (ret) { dev_err(pctrl->dev, "Failed to add irqchip to gpiochip\n"); -- cgit v1.2.3 From 400383059868487869772b1c68ab8db4b6c81cbb Mon Sep 17 00:00:00 2001 From: Prasad Sodagudi Date: Mon, 24 Sep 2018 16:25:55 -0700 Subject: kernel: time: Add delay after cpu_relax() in tight loops Tight loops of spin_lock_irqsave() and spin_unlock_irqrestore() in timer and hrtimer are causing scheduling delays. Add delay of few nano seconds after cpu_relax in the timer/hrtimer tight loops. 
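The retry loops in alarm_cancel()/hrtimer_cancel()/del_timer_sync() keep spinning until the timer callback is no longer running; the hunks below only add a short delay between attempts. A minimal userspace sketch of that retry-with-backoff shape (the state machine is made up, the 350 ns figure mirrors the patch, and nanosleep() stands in for cpu_relax()+ndelay()):

#include <stdatomic.h>
#include <stdio.h>
#include <time.h>

static atomic_int timer_state;	/* 0 = idle, 1 = armed, -1 = callback running */

static int cancel_with_backoff(void)
{
	const struct timespec backoff = { .tv_sec = 0, .tv_nsec = 350 };

	for (;;) {
		int expected = 1;

		/* try to move armed -> idle, i.e. cancel the timer */
		if (atomic_compare_exchange_strong(&timer_state, &expected, 0))
			return 1;	/* cancelled an armed timer */
		if (expected == 0)
			return 0;	/* nothing to cancel */
		/* callback busy: brief delay instead of hammering the state */
		nanosleep(&backoff, NULL);
	}
}

int main(void)
{
	atomic_store(&timer_state, 1);
	printf("cancel returned %d\n", cancel_with_backoff());
	return 0;
}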
Change-Id: Iaa0ab92da93f7b245b1d922b6edca2bebdc0fbce Signed-off-by: Prasad Sodagudi --- include/linux/time.h | 1 + kernel/time/alarmtimer.c | 2 ++ kernel/time/hrtimer.c | 3 +++ kernel/time/timer.c | 2 ++ 4 files changed, 8 insertions(+) diff --git a/include/linux/time.h b/include/linux/time.h index 62cc50700004..cbb55e004342 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -9,6 +9,7 @@ extern struct timezone sys_tz; #define TIME_T_MAX (time_t)((1UL << ((sizeof(time_t) << 3) - 1)) - 1) +#define TIMER_LOCK_TIGHT_LOOP_DELAY_NS 350 static inline int timespec_equal(const struct timespec *a, const struct timespec *b) diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 4171fee2d4ec..612c97156df7 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -25,6 +25,7 @@ #include #include #include +#include #ifdef CONFIG_MSM_PM #include "lpm-levels.h" @@ -503,6 +504,7 @@ int alarm_cancel(struct alarm *alarm) if (ret >= 0) return ret; cpu_relax(); + ndelay(TIMER_LOCK_TIGHT_LOOP_DELAY_NS); } } EXPORT_SYMBOL_GPL(alarm_cancel); diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 79fadcad21ff..6bd4247198e2 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -49,6 +49,7 @@ #include #include #include +#include #include @@ -156,6 +157,7 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, raw_spin_unlock_irqrestore(&base->cpu_base->lock, *flags); } cpu_relax(); + ndelay(TIMER_LOCK_TIGHT_LOOP_DELAY_NS); } } @@ -1061,6 +1063,7 @@ int hrtimer_cancel(struct hrtimer *timer) if (ret >= 0) return ret; cpu_relax(); + ndelay(TIMER_LOCK_TIGHT_LOOP_DELAY_NS); } } EXPORT_SYMBOL_GPL(hrtimer_cancel); diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 67646a316436..2d8b82d90c9f 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -798,6 +798,7 @@ static struct tvec_base *lock_timer_base(struct timer_list *timer, spin_unlock_irqrestore(&base->lock, *flags); } cpu_relax(); + ndelay(TIMER_LOCK_TIGHT_LOOP_DELAY_NS); } } @@ -1148,6 +1149,7 @@ int del_timer_sync(struct timer_list *timer) if (ret >= 0) return ret; cpu_relax(); + ndelay(TIMER_LOCK_TIGHT_LOOP_DELAY_NS); } } EXPORT_SYMBOL(del_timer_sync); -- cgit v1.2.3 From 58c453484f7ef1b81d9da4a5696e849a92e61883 Mon Sep 17 00:00:00 2001 From: Giovanni Gherdovich Date: Fri, 5 Aug 2016 10:21:56 +0200 Subject: sched/cputime: Mitigate performance regression in times()/clock_gettime() Commit: 6e998916dfe3 ("sched/cputime: Fix clock_nanosleep()/clock_gettime() inconsistency") fixed a problem whereby clock_nanosleep() followed by clock_gettime() could allow a task to wake early. It addressed the problem by calling the scheduling classes update_curr() when the cputimer starts. Said change induced a considerable performance regression on the syscalls times() and clock_gettimes(CLOCK_PROCESS_CPUTIME_ID). There are some debuggers and applications that monitor their own performance that accidentally depend on the performance of these specific calls. This patch mitigates the performace loss by prefetching data in the CPU cache, as stalls due to cache misses appear to be where most time is spent in our benchmarks. Here are the performance gain of this patch over v4.7-rc7 on a Sandy Bridge box with 32 logical cores and 2 NUMA nodes. 
The test is repeated with a variable number of threads, from 2 to 4*num_cpus; the results are in seconds and correspond to the average of 10 runs; the percentage gain is computed with (before-after)/before so a positive value is an improvement (it's faster). The improvement varies between a few percent for 5-20 threads and more than 10% for 2 or >20 threads. pound_clock_gettime: threads 4.7-rc7 patched 4.7-rc7 [num] [secs] [secs (percent)] 2 3.48 3.06 ( 11.83%) 5 3.33 3.25 ( 2.40%) 8 3.37 3.26 ( 3.30%) 12 3.32 3.37 ( -1.60%) 21 4.01 3.90 ( 2.74%) 30 3.63 3.36 ( 7.41%) 48 3.71 3.11 ( 16.27%) 79 3.75 3.16 ( 15.74%) 110 3.81 3.25 ( 14.80%) 128 3.88 3.31 ( 14.76%) pound_times: threads 4.7-rc7 patched 4.7-rc7 [num] [secs] [secs (percent)] 2 3.65 3.25 ( 11.03%) 5 3.45 3.17 ( 7.92%) 8 3.52 3.22 ( 8.69%) 12 3.29 3.36 ( -2.04%) 21 4.07 3.92 ( 3.78%) 30 3.87 3.40 ( 12.17%) 48 3.79 3.16 ( 16.61%) 79 3.88 3.28 ( 15.42%) 110 3.90 3.38 ( 13.35%) 128 4.00 3.38 ( 15.45%) pound_clock_gettime and pound_times are two benchmarks included in the MMTests framework. They launch a given number of threads which repeatedly call times() or clock_gettime(). The results above can be reproduced by cloning MMTests from github.com and running the "poundtime" workload: $ git clone https://github.com/gormanm/mmtests.git $ cd mmtests $ cp configs/config-global-dhp__workload_poundtime config $ ./run-mmtests.sh --run-monitor $(uname -r) The above will run "poundtime" measuring the kernel currently running on the machine; once a new kernel is installed and the machine rebooted, running again $ cd mmtests $ ./run-mmtests.sh --run-monitor $(uname -r) will produce results to compare with. A comparison table will be output with: $ cd mmtests/work/log $ ../../compare-kernels.sh The table will contain a lot of entries; grepping for "Amean" (as in "arithmetic mean") will give the tables presented above. The source code for the two benchmarks is reported at the end of this changelog for clarity. The cache misses addressed by this patch were found using a combination of `perf top`, `perf record` and `perf annotate`. The incriminated lines were found to be struct sched_entity *curr = cfs_rq->curr; and delta_exec = now - curr->exec_start; in the function update_curr() from kernel/sched/fair.c. This patch prefetches the data from memory just before update_curr is called in the execution path of interest.
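A toy userspace illustration of the prefetch idea, using the GCC/Clang __builtin_prefetch() builtin; the struct and function here are made up and only stand in for cfs_rq->curr and the later update_curr() call:

#include <stdio.h>

struct entity {
	unsigned long long load[8];	/* stand-in for fields ahead of exec_start */
	unsigned long long exec_start;
	unsigned long long vruntime;
};

static unsigned long long account(struct entity *curr, unsigned long long now)
{
	/* warm the cache lines the computation below will need */
	__builtin_prefetch(curr);
	__builtin_prefetch(&curr->exec_start);

	/* ... other work that would otherwise expose the cache miss ... */

	return now - curr->exec_start;
}

int main(void)
{
	struct entity e = { .exec_start = 100 };

	printf("delta = %llu\n", account(&e, 250));
	return 0;
}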
A comparison of the total number of cycles before and after the patch follows; the data is obtained using `perf stat -r 10 -ddd ` running over the same sequence of number of threads used above (a positive gain is an improvement): threads cycles before cycles after gain 2 19,699,563,964 +-1.19% 17,358,917,517 +-1.85% 11.88% 5 47,401,089,566 +-2.96% 45,103,730,829 +-0.97% 4.85% 8 80,923,501,004 +-3.01% 71,419,385,977 +-0.77% 11.74% 12 112,326,485,473 +-0.47% 110,371,524,403 +-0.47% 1.74% 21 193,455,574,299 +-0.72% 180,120,667,904 +-0.36% 6.89% 30 315,073,519,013 +-1.64% 271,222,225,950 +-1.29% 13.92% 48 321,969,515,332 +-1.48% 273,353,977,321 +-1.16% 15.10% 79 337,866,003,422 +-0.97% 289,462,481,538 +-1.05% 14.33% 110 338,712,691,920 +-0.78% 290,574,233,170 +-0.77% 14.21% 128 348,384,794,006 +-0.50% 292,691,648,206 +-0.66% 15.99% A comparison of cache miss vs total cache loads ratios, before and after the patch (again from the `perf stat -r 10 -ddd ` tables): threads L1 misses/total*100 L1 misses/total*100 gain before after 2 7.43 +-4.90% 7.36 +-4.70% 0.94% 5 13.09 +-4.74% 13.52 +-3.73% -3.28% 8 13.79 +-5.61% 12.90 +-3.27% 6.45% 12 11.57 +-2.44% 8.71 +-1.40% 24.72% 21 12.39 +-3.92% 9.97 +-1.84% 19.53% 30 13.91 +-2.53% 11.73 +-2.28% 15.67% 48 13.71 +-1.59% 12.32 +-1.97% 10.14% 79 14.44 +-0.66% 13.40 +-1.06% 7.20% 110 15.86 +-0.50% 14.46 +-0.59% 8.83% 128 16.51 +-0.32% 15.06 +-0.78% 8.78% As a final note, the following shows the evolution of performance figures in the "poundtime" benchmark and pinpoints commit 6e998916dfe3 ("sched/cputime: Fix clock_nanosleep()/clock_gettime() inconsistency") as a major source of degradation, mostly unaddressed to this day (figures expressed in seconds). pound_clock_gettime: threads parent of 6e998916dfe3 4.7-rc7 6e998916dfe3 itself 2 2.23 3.68 ( -64.56%) 3.48 (-55.48%) 5 2.83 3.78 ( -33.42%) 3.33 (-17.43%) 8 2.84 4.31 ( -52.12%) 3.37 (-18.76%) 12 3.09 3.61 ( -16.74%) 3.32 ( -7.17%) 21 3.14 4.63 ( -47.36%) 4.01 (-27.71%) 30 3.28 5.75 ( -75.37%) 3.63 (-10.80%) 48 3.02 6.05 (-100.56%) 3.71 (-22.99%) 79 2.88 6.30 (-118.90%) 3.75 (-30.26%) 110 2.95 6.46 (-119.00%) 3.81 (-29.24%) 128 3.05 6.42 (-110.08%) 3.88 (-27.04%) pound_times: threads parent of 6e998916dfe3 4.7-rc7 6e998916dfe3 itself 2 2.27 3.73 ( -64.71%) 3.65 (-61.14%) 5 2.78 3.77 ( -35.56%) 3.45 (-23.98%) 8 2.79 4.41 ( -57.71%) 3.52 (-26.05%) 12 3.02 3.56 ( -17.94%) 3.29 ( -9.08%) 21 3.10 4.61 ( -48.74%) 4.07 (-31.34%) 30 3.33 5.75 ( -72.53%) 3.87 (-16.01%) 48 2.96 6.06 (-105.04%) 3.79 (-28.10%) 79 2.88 6.24 (-116.83%) 3.88 (-34.81%) 110 2.98 6.37 (-114.08%) 3.90 (-31.12%) 128 3.10 6.35 (-104.61%) 4.00 (-28.87%) The source code of the two benchmarks follows. 
To compile the two: NR_THREADS=42 for FILE in pound_times pound_clock_gettime; do gcc -lrt -O2 -lpthread -DNUM_THREADS=$NR_THREADS $FILE.c -o $FILE done ==== BEGIN pound_times.c ==== struct tms start; void *pound (void *threadid) { struct tms end; int oldutime = 0; int utime; int i; for (i = 0; i < 5000000 / NUM_THREADS; i++) { times(&end); utime = ((int)end.tms_utime - (int)start.tms_utime); if (oldutime > utime) { printf("utime decreased, was %d, now %d!\n", oldutime, utime); } oldutime = utime; } pthread_exit(NULL); } int main() { pthread_t th[NUM_THREADS]; long i; times(&start); for (i = 0; i < NUM_THREADS; i++) { pthread_create (&th[i], NULL, pound, (void *)i); } pthread_exit(NULL); return 0; } ==== END pound_times.c ==== ==== BEGIN pound_clock_gettime.c ==== void *pound (void *threadid) { struct timespec ts; int rc, i; unsigned long prev = 0, this = 0; for (i = 0; i < 5000000 / NUM_THREADS; i++) { rc = clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts); if (rc < 0) perror("clock_gettime"); this = (ts.tv_sec * 1000000000) + ts.tv_nsec; if (0 && this < prev) printf("%lu ns timewarp at iteration %d\n", prev - this, i); prev = this; } pthread_exit(NULL); } int main() { pthread_t th[NUM_THREADS]; long rc, i; pid_t pgid; for (i = 0; i < NUM_THREADS; i++) { rc = pthread_create(&th[i], NULL, pound, (void *)i); if (rc < 0) perror("pthread_create"); } pthread_exit(NULL); return 0; } ==== END pound_clock_gettime.c ==== Suggested-by: Mike Galbraith Change-Id: Iad82d9f31c92e50e1e3b1339892512526ceb0acf Signed-off-by: Giovanni Gherdovich Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mel Gorman Cc: Peter Zijlstra Cc: Stanislaw Gruszka Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1470385316-15027-2-git-send-email-ggherdovich@suse.cz Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6c3f46e759d2..18afb0fe2704 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -78,6 +78,7 @@ #include #include #include +#include #include #include @@ -3133,6 +3134,23 @@ DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); EXPORT_PER_CPU_SYMBOL(kstat); EXPORT_PER_CPU_SYMBOL(kernel_cpustat); +/* + * The function fair_sched_class.update_curr accesses the struct curr + * and its field curr->exec_start; when called from task_sched_runtime(), + * we observe a high rate of cache misses in practice. + * Prefetching this data results in improved performance. + */ +static inline void prefetch_curr_exec_start(struct task_struct *p) +{ +#ifdef CONFIG_FAIR_GROUP_SCHED + struct sched_entity *curr = (&p->se)->cfs_rq->curr; +#else + struct sched_entity *curr = (&task_rq(p)->cfs)->curr; +#endif + prefetch(curr); + prefetch(&curr->exec_start); +} + /* * Return accounted runtime for the task. * In case the task is currently running, return the runtime plus current's @@ -3167,6 +3185,7 @@ unsigned long long task_sched_runtime(struct task_struct *p) * thread, breaking clock_gettime(). */ if (task_current(rq, p) && task_on_rq_queued(p)) { + prefetch_curr_exec_start(p); update_rq_clock(rq); p->sched_class->update_curr(rq); } -- cgit v1.2.3 From f31078b5825f71499fa95b85babf6ac8c776c37d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 20 Jun 2017 01:37:38 +0200 Subject: genirq: Introduce effective affinity mask There is currently no way to evaluate the effective affinity mask of a given interrupt. Many irq chips allow only a single target CPU or a subset of CPUs in the affinity mask. 
Updating the mask at the time of setting the affinity to the subset would be counterproductive because information for cpu hotplug about assigned interrupt affinities gets lost. On CPU hotplug it's also pointless to force migrate an interrupt, which is not targeted at the CPU effectively. But currently the information is not available. Provide a seperate mask to be updated by the irq_chip->irq_set_affinity() implementations. Implement the read only proc files so the user can see the effective mask as well w/o trying to deduce it from /proc/interrupts. Signed-off-by: Thomas Gleixner Cc: Jens Axboe Cc: Marc Zyngier Cc: Michael Ellerman Cc: Keith Busch Cc: Peter Zijlstra Cc: Christoph Hellwig Link: http://lkml.kernel.org/r/20170619235446.247834245@linutronix.de Change-Id: Ibeec0031edb532d52cb411286f785aec160d6139 --- include/linux/irq.h | 29 +++++++++++++++++ kernel/irq/Kconfig | 4 +++ kernel/irq/irqdesc.c | 14 ++++++++ kernel/irq/proc.c | 90 ++++++++++++++++++++++++++++++++++++++++++++++++---- 4 files changed, 130 insertions(+), 7 deletions(-) diff --git a/include/linux/irq.h b/include/linux/irq.h index 8da001eb82aa..0e57f41bde84 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -136,6 +136,9 @@ struct irq_domain; * @node: node index useful for balancing * @handler_data: per-IRQ data for the irq_chip methods * @affinity: IRQ affinity on SMP + * @effective_affinity: The effective IRQ affinity on SMP as some irq + * chips do not allow multi CPU destinations. + * A subset of @affinity. * @msi_desc: MSI descriptor */ struct irq_common_data { @@ -146,6 +149,9 @@ struct irq_common_data { void *handler_data; struct msi_desc *msi_desc; cpumask_var_t affinity; +#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK + cpumask_var_t effective_affinity; +#endif }; /** @@ -690,6 +696,29 @@ static inline struct cpumask *irq_data_get_affinity_mask(struct irq_data *d) return d->common->affinity; } +#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK +static inline +struct cpumask *irq_data_get_effective_affinity_mask(struct irq_data *d) +{ + return d->common->effective_affinity; +} +static inline void irq_data_update_effective_affinity(struct irq_data *d, + const struct cpumask *m) +{ + cpumask_copy(d->common->effective_affinity, m); +} +#else +static inline void irq_data_update_effective_affinity(struct irq_data *d, + const struct cpumask *m) +{ +} +static inline +struct cpumask *irq_data_get_effective_affinity_mask(struct irq_data *d) +{ + return d->common->affinity; +} +#endif + unsigned int arch_dynirq_lower_bound(unsigned int from); int __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node, diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 3b48dab80164..5d00ba9af4ec 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -21,6 +21,10 @@ config GENERIC_IRQ_SHOW config GENERIC_IRQ_SHOW_LEVEL bool +# Supports effective affinity mask +config GENERIC_IRQ_EFFECTIVE_AFF_MASK + bool + # Facility to allocate a hardware interrupt. This is legacy support # and should not be used in new code. Use irq domains instead. 
config GENERIC_IRQ_LEGACY_ALLOC_HWIRQ diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 52fbf88cd2d8..e0de4682f57a 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -43,8 +43,19 @@ static int alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) gfp, node)) return -ENOMEM; +#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK + if (!zalloc_cpumask_var_node(&desc->irq_common_data.effective_affinity, + GFP_KERNEL, node)) { + free_cpumask_var(desc->irq_common_data.affinity); + return -ENOMEM; + } +#endif + #ifdef CONFIG_GENERIC_PENDING_IRQ if (!zalloc_cpumask_var_node(&desc->pending_mask, gfp, node)) { +#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK + free_cpumask_var(desc->irq_common_data.effective_affinity); +#endif free_cpumask_var(desc->irq_common_data.affinity); return -ENOMEM; } @@ -127,6 +138,9 @@ static void free_masks(struct irq_desc *desc) free_cpumask_var(desc->pending_mask); #endif free_cpumask_var(desc->irq_common_data.affinity); +#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK + free_cpumask_var(desc->irq_common_data.effective_affinity); +#endif } #else static inline void free_masks(struct irq_desc *desc) { } diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index b05509af0352..9da4f2b075d1 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -37,19 +37,47 @@ static struct proc_dir_entry *root_irq_dir; #ifdef CONFIG_SMP +enum { + AFFINITY, + AFFINITY_LIST, + EFFECTIVE, + EFFECTIVE_LIST, +}; + static int show_irq_affinity(int type, struct seq_file *m, void *v) { struct irq_desc *desc = irq_to_desc((long)m->private); - const struct cpumask *mask = desc->irq_common_data.affinity; + const struct cpumask *mask; + switch (type) { + case AFFINITY: + case AFFINITY_LIST: + mask = desc->irq_common_data.affinity; #ifdef CONFIG_GENERIC_PENDING_IRQ - if (irqd_is_setaffinity_pending(&desc->irq_data)) - mask = desc->pending_mask; + if (irqd_is_setaffinity_pending(&desc->irq_data)) + mask = desc->pending_mask; #endif - if (type) + break; + case EFFECTIVE: + case EFFECTIVE_LIST: +#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK + mask = desc->irq_common_data.effective_affinity; + break; +#else + return -EINVAL; +#endif + }; + + switch (type) { + case AFFINITY_LIST: + case EFFECTIVE_LIST: seq_printf(m, "%*pbl\n", cpumask_pr_args(mask)); - else + break; + case AFFINITY: + case EFFECTIVE: seq_printf(m, "%*pb\n", cpumask_pr_args(mask)); + break; + } return 0; } @@ -80,12 +108,12 @@ static int irq_affinity_hint_proc_show(struct seq_file *m, void *v) int no_irq_affinity; static int irq_affinity_proc_show(struct seq_file *m, void *v) { - return show_irq_affinity(0, m, v); + return show_irq_affinity(AFFINITY, m, v); } static int irq_affinity_list_proc_show(struct seq_file *m, void *v) { - return show_irq_affinity(1, m, v); + return show_irq_affinity(AFFINITY_LIST, m, v); } @@ -188,6 +216,44 @@ static const struct file_operations irq_affinity_list_proc_fops = { .write = irq_affinity_list_proc_write, }; +#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK +static int irq_effective_aff_proc_show(struct seq_file *m, void *v) +{ + return show_irq_affinity(EFFECTIVE, m); +} + +static int irq_effective_aff_list_proc_show(struct seq_file *m, void *v) +{ + return show_irq_affinity(EFFECTIVE_LIST, m); +} + +static int irq_effective_aff_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, irq_effective_aff_proc_show, PDE_DATA(inode)); +} + +static int irq_effective_aff_list_proc_open(struct inode *inode, + struct file *file) +{ + return single_open(file, 
irq_effective_aff_list_proc_show, + PDE_DATA(inode)); +} + +static const struct file_operations irq_effective_aff_proc_fops = { + .open = irq_effective_aff_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static const struct file_operations irq_effective_aff_list_proc_fops = { + .open = irq_effective_aff_list_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif + static int default_affinity_show(struct seq_file *m, void *v) { seq_printf(m, "%*pb\n", cpumask_pr_args(irq_default_affinity)); @@ -368,6 +434,12 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc) proc_create_data("node", 0444, desc->dir, &irq_node_proc_fops, (void *)(long)irq); +# ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK + proc_create_data("effective_affinity", 0444, desc->dir, + &irq_effective_aff_proc_fops, (void *)(long)irq); + proc_create_data("effective_affinity_list", 0444, desc->dir, + &irq_effective_aff_list_proc_fops, (void *)(long)irq); +# endif #endif proc_create_data("spurious", 0444, desc->dir, @@ -388,6 +460,10 @@ void unregister_irq_proc(unsigned int irq, struct irq_desc *desc) remove_proc_entry("affinity_hint", desc->dir); remove_proc_entry("smp_affinity_list", desc->dir); remove_proc_entry("node", desc->dir); +# ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK + remove_proc_entry("effective_affinity", desc->dir); + remove_proc_entry("effective_affinity_list", desc->dir); +# endif #endif remove_proc_entry("spurious", desc->dir); -- cgit v1.2.3 From 9209b5556f6acc6b2c0c29135db247f90a3eb78e Mon Sep 17 00:00:00 2001 From: Pavankumar Kondeti Date: Tue, 4 Dec 2018 12:04:59 +0530 Subject: power: qos: Use effective affinity mask PM_QOS_REQ_AFFINE_IRQ request is supposed to apply the QoS vote for the CPU(s) on which the attached interrupt arrives. Currently the QoS vote is applied to all the CPUs present in the IRQ affinity mask i.e desc->irq_data.common->affinity. However some chips configure only a single CPU from this affinity mask to receive the IRQ. This information is present in effective affinity mask of an IRQ. Start using it so that a QoS vote is not applied to other CPUs on which the IRQ never comes but present in the affinity mask. 
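In effect the vote now targets the CPUs in the effective mask, unless that mask is still empty because the IRQ has not been started, in which case the configured affinity is used. A toy model of that selection with plain integer bitmasks (bit n = CPU n; illustrative only, not kernel code):

#include <stdint.h>
#include <stdio.h>

static uint64_t qos_target_cpus(uint64_t affinity, uint64_t effective)
{
	/* prefer the effective affinity, fall back to the full mask if empty */
	return effective ? effective : affinity;
}

int main(void)
{
	/* affinity allows CPU0-3, but the chip delivers only to CPU2 */
	printf("vote mask %#llx\n",
	       (unsigned long long)qos_target_cpus(0xf, 0x4));
	/* IRQ not started yet: effective mask empty, use configured affinity */
	printf("vote mask %#llx\n",
	       (unsigned long long)qos_target_cpus(0xf, 0x0));
	return 0;
}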
Change-Id: If26aa23bebe4a7d07ffedb5ff833ccdb4f4fb6ea Signed-off-by: Pavankumar Kondeti --- kernel/power/qos.c | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/kernel/power/qos.c b/kernel/power/qos.c index e6eceb0aa496..49dc710d4a3a 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -545,19 +545,29 @@ static void pm_qos_irq_release(struct kref *ref) } static void pm_qos_irq_notify(struct irq_affinity_notify *notify, - const cpumask_t *mask) + const cpumask_t *unused_mask) { unsigned long flags; struct pm_qos_request *req = container_of(notify, struct pm_qos_request, irq_notify); struct pm_qos_constraints *c = pm_qos_array[req->pm_qos_class]->constraints; + struct irq_desc *desc = irq_to_desc(req->irq); + struct cpumask *new_affinity = + irq_data_get_effective_affinity_mask(&desc->irq_data); + bool affinity_changed = false; spin_lock_irqsave(&pm_qos_lock, flags); - cpumask_copy(&req->cpus_affine, mask); + if (!cpumask_equal(&req->cpus_affine, new_affinity)) { + cpumask_copy(&req->cpus_affine, new_affinity); + affinity_changed = true; + } + spin_unlock_irqrestore(&pm_qos_lock, flags); - pm_qos_update_target(c, req, PM_QOS_UPDATE_REQ, req->node.prio); + if (affinity_changed) + pm_qos_update_target(c, req, PM_QOS_UPDATE_REQ, + req->node.prio); } #endif @@ -601,9 +611,17 @@ void pm_qos_add_request(struct pm_qos_request *req, if (!desc) return; - mask = desc->irq_data.common->affinity; - /* Get the current affinity */ + /* + * If the IRQ is not started, the effective affinity + * won't be set. So fallback to the default affinity. + */ + mask = irq_data_get_effective_affinity_mask( + &desc->irq_data); + if (cpumask_empty(mask)) + mask = irq_data_get_affinity_mask( + &desc->irq_data); + cpumask_copy(&req->cpus_affine, mask); req->irq_notify.irq = req->irq; req->irq_notify.notify = pm_qos_irq_notify; -- cgit v1.2.3 From 602bf4096dabb119eb3e176353c4607030fbb1c7 Mon Sep 17 00:00:00 2001 From: Pavankumar Kondeti Date: Mon, 3 Dec 2018 15:05:26 +0530 Subject: genirq: Honour IRQ's affinity hint during migration An IRQ affinity is broken during hotplug/isolation when there are no online and un-isolated CPUs in the current affinity mask. An online and un-isolated CPU from the irq_default_affinity mask (i.e /proc/irq/default_smp_affinity) is used as the current affinity. However Individual IRQs can have their affinity hint set via irq_set_affinity_hint() API. When such hint is available, use it instead of the irq_default_affinity which is a system level setting. Change-Id: I53a537582ec4e1aed0c59b49f4fd5b6ca7c0c332 Signed-off-by: Pavankumar Kondeti --- kernel/irq/cpuhotplug.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c index 4684b7595e63..9fad2dc50452 100644 --- a/kernel/irq/cpuhotplug.c +++ b/kernel/irq/cpuhotplug.c @@ -36,6 +36,10 @@ static bool migrate_one_irq(struct irq_desc *desc) affinity = &available_cpus; if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { + const struct cpumask *default_affinity; + + default_affinity = desc->affinity_hint ? 
: irq_default_affinity; + /* * The order of preference for selecting a fallback CPU is * @@ -45,9 +49,9 @@ static bool migrate_one_irq(struct irq_desc *desc) */ cpumask_andnot(&available_cpus, cpu_online_mask, cpu_isolated_mask); - if (cpumask_intersects(&available_cpus, irq_default_affinity)) + if (cpumask_intersects(&available_cpus, default_affinity)) cpumask_and(&available_cpus, &available_cpus, - irq_default_affinity); + default_affinity); else if (cpumask_empty(&available_cpus)) affinity = cpu_online_mask; -- cgit v1.2.3 From 13e66175965635ed74c948f232a32033230cc5d0 Mon Sep 17 00:00:00 2001 From: Pavankumar Kondeti Date: Thu, 28 Feb 2019 10:40:39 +0530 Subject: cpuset: Restore tasks affinity while moving across cpusets When tasks move across cpusets, the current affinity settings are lost. Cache the task affinity and restore it during cpuset migration. The restoring happens only when the cached affinity is subset of the current cpuset settings. Change-Id: I6c2ec1d5e3d994e176926d94b9e0cc92418020cc Signed-off-by: Pavankumar Kondeti --- include/linux/init_task.h | 1 + include/linux/sched.h | 1 + kernel/cpuset.c | 18 ++++++++++++++++-- kernel/sched/core.c | 4 ++++ 4 files changed, 22 insertions(+), 2 deletions(-) diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 021b1e9ff6cd..8aed56931361 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -208,6 +208,7 @@ extern struct task_group root_task_group; .policy = SCHED_NORMAL, \ .cpus_allowed = CPU_MASK_ALL, \ .nr_cpus_allowed= NR_CPUS, \ + .cpus_requested = CPU_MASK_ALL, \ .mm = NULL, \ .active_mm = &init_mm, \ .restart_block = { \ diff --git a/include/linux/sched.h b/include/linux/sched.h index 70c1f7f9e4fa..9cb6964d178e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1700,6 +1700,7 @@ struct task_struct { unsigned int policy; int nr_cpus_allowed; cpumask_t cpus_allowed; + cpumask_t cpus_requested; #ifdef CONFIG_PREEMPT_RCU int rcu_read_lock_nesting; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index f3e6608313a2..a0bf3a7ce550 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -852,6 +852,20 @@ void rebuild_sched_domains(void) put_online_cpus(); } +static int update_cpus_allowed(struct cpuset *cs, struct task_struct *p, + const struct cpumask *new_mask) +{ + int ret; + + if (cpumask_subset(&p->cpus_requested, cs->cpus_requested)) { + ret = set_cpus_allowed_ptr(p, &p->cpus_requested); + if (!ret) + return ret; + } + + return set_cpus_allowed_ptr(p, new_mask); +} + /** * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed @@ -867,7 +881,7 @@ static void update_tasks_cpumask(struct cpuset *cs) css_task_iter_start(&cs->css, &it); while ((task = css_task_iter_next(&it))) - set_cpus_allowed_ptr(task, cs->effective_cpus); + update_cpus_allowed(cs, task, cs->effective_cpus); css_task_iter_end(&it); } @@ -1556,7 +1570,7 @@ static void cpuset_attach(struct cgroup_taskset *tset) * can_attach beforehand should guarantee that this doesn't * fail. 
TODO: have a better way to handle failure here */ - WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach)); + WARN_ON_ONCE(update_cpus_allowed(cs, task, cpus_attach)); cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to); cpuset_update_task_spread_flag(cs, task); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 18afb0fe2704..b33433586774 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4921,6 +4921,9 @@ again: retval = -EINVAL; } + if (!retval && !(p->flags & PF_KTHREAD)) + cpumask_and(&p->cpus_requested, in_mask, cpu_possible_mask); + out_free_new_mask: free_cpumask_var(new_mask); out_free_cpus_allowed: @@ -8344,6 +8347,7 @@ void __init sched_init_smp(void) /* Move init over to a non-isolated CPU */ if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0) BUG(); + cpumask_copy(¤t->cpus_requested, cpu_possible_mask); sched_init_granularity(); free_cpumask_var(non_isolated_cpus); -- cgit v1.2.3 From 12af218146a6c1f3b2b3cec48b14076131f8ecdb Mon Sep 17 00:00:00 2001 From: Adrian Salido Date: Tue, 16 May 2017 20:08:29 -0700 Subject: msm: mdss: add idle state node Add a helper node that can be used to notify user space through sysfs node when fb device has not had any activity for a specified amount of time (through idle_time node). Bug: 62110101 Change-Id: I4dfa4b1a376149aa55a940dad7ac336ec99f1af8 Signed-off-by: Adrian Salido --- drivers/video/fbdev/msm/mdss_fb.c | 45 +++++++++++++++++++++++++++++++++------ 1 file changed, 38 insertions(+), 7 deletions(-) diff --git a/drivers/video/fbdev/msm/mdss_fb.c b/drivers/video/fbdev/msm/mdss_fb.c index 530da7af1866..64f86084b01d 100644 --- a/drivers/video/fbdev/msm/mdss_fb.c +++ b/drivers/video/fbdev/msm/mdss_fb.c @@ -520,12 +520,17 @@ static void __mdss_fb_idle_notify_work(struct work_struct *work) /* Notify idle-ness here */ pr_debug("Idle timeout %dms expired!\n", mfd->idle_time); - if (mfd->idle_time) - sysfs_notify(&mfd->fbi->dev->kobj, NULL, "idle_notify"); + mfd->idle_state = MDSS_FB_IDLE; + /* + * idle_notify node events are used to reduce MDP load when idle, + * this is not needed for command mode panels. 
+ */ + if (mfd->idle_time && mfd->panel.type != MIPI_CMD_PANEL) + sysfs_notify(&mfd->fbi->dev->kobj, NULL, "idle_notify"); + sysfs_notify(&mfd->fbi->dev->kobj, NULL, "idle_state"); } - static ssize_t mdss_fb_get_fps_info(struct device *dev, struct device_attribute *attr, char *buf) { @@ -586,6 +591,26 @@ static ssize_t mdss_fb_get_idle_notify(struct device *dev, return ret; } +static ssize_t mdss_fb_get_idle_state(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct fb_info *fbi = dev_get_drvdata(dev); + struct msm_fb_data_type *mfd = fbi->par; + const char *state_strs[] = { + [MDSS_FB_NOT_IDLE] = "active", + [MDSS_FB_IDLE_TIMER_RUNNING] = "pending", + [MDSS_FB_IDLE] = "idle", + }; + int state = mfd->idle_state; + const char *s; + if (state < ARRAY_SIZE(state_strs) && state_strs[state]) + s = state_strs[state]; + else + s = "invalid"; + + return scnprintf(buf, PAGE_SIZE, "%s\n", s); +} + static ssize_t mdss_fb_get_panel_info(struct device *dev, struct device_attribute *attr, char *buf) { @@ -922,6 +947,7 @@ static DEVICE_ATTR(show_blank_event, S_IRUGO, mdss_mdp_show_blank_event, NULL); static DEVICE_ATTR(idle_time, S_IRUGO | S_IWUSR | S_IWGRP, mdss_fb_get_idle_time, mdss_fb_set_idle_time); static DEVICE_ATTR(idle_notify, S_IRUGO, mdss_fb_get_idle_notify, NULL); +static DEVICE_ATTR(idle_state, S_IRUGO, mdss_fb_get_idle_state, NULL); static DEVICE_ATTR(msm_fb_panel_info, S_IRUGO, mdss_fb_get_panel_info, NULL); static DEVICE_ATTR(msm_fb_src_split_info, S_IRUGO, mdss_fb_get_src_split_info, NULL); @@ -943,6 +969,7 @@ static struct attribute *mdss_fb_attrs[] = { &dev_attr_show_blank_event.attr, &dev_attr_idle_time.attr, &dev_attr_idle_notify.attr, + &dev_attr_idle_state.attr, &dev_attr_msm_fb_panel_info.attr, &dev_attr_msm_fb_src_split_info.attr, &dev_attr_msm_fb_thermal_level.attr, @@ -3136,14 +3163,18 @@ static int __mdss_fb_sync_buf_done_callback(struct notifier_block *p, ret = __mdss_fb_wait_for_fence_sub(sync_pt_data, sync_pt_data->temp_fen, fence_cnt); } - if (mfd->idle_time && !mod_delayed_work(system_wq, + if (mfd->idle_time) { + if (!mod_delayed_work(system_wq, &mfd->idle_notify_work, msecs_to_jiffies(mfd->idle_time))) - pr_debug("fb%d: restarted idle work\n", - mfd->index); + pr_debug("fb%d: restarted idle work\n", + mfd->index); + mfd->idle_state = MDSS_FB_IDLE_TIMER_RUNNING; + } else { + mfd->idle_state = MDSS_FB_IDLE; + } if (ret == -ETIME) ret = NOTIFY_BAD; - mfd->idle_state = MDSS_FB_IDLE_TIMER_RUNNING; break; case MDP_NOTIFY_FRAME_FLUSHED: pr_debug("%s: frame flushed\n", sync_pt_data->fence_name); -- cgit v1.2.3 From 620df03a7229bd2e13cdd32c3b56babc5b40b797 Mon Sep 17 00:00:00 2001 From: Georg Veichtlbauer Date: Mon, 26 Jun 2023 13:33:34 +0200 Subject: msm: mdss: Treat polling_en as the bool that it is Change-Id: Ifaa68915b52a0d6b54a5f80576ae65ba527a6c16 --- drivers/video/fbdev/msm/mdss_mdp_intf_video.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/video/fbdev/msm/mdss_mdp_intf_video.c b/drivers/video/fbdev/msm/mdss_mdp_intf_video.c index 3761fa4af0eb..caa910db508c 100644 --- a/drivers/video/fbdev/msm/mdss_mdp_intf_video.c +++ b/drivers/video/fbdev/msm/mdss_mdp_intf_video.c @@ -1236,7 +1236,7 @@ static int mdss_mdp_video_wait4comp(struct mdss_mdp_ctl *ctl, void *arg) if (rc == 0) { pr_warn("vsync wait timeout %d, fallback to poll mode\n", ctl->num); - ctx->polling_en++; + ctx->polling_en = true; rc = mdss_mdp_video_pollwait(ctl); } else { rc = 0; -- cgit v1.2.3 From 12d40f1995b47ccbb29081c07591d1521d872e96 Mon Sep 17 00:00:00 
2001 From: Georg Veichtlbauer Date: Mon, 26 Jun 2023 13:45:53 +0200 Subject: msm: mdss: Fix indentation Change-Id: I930b89ba4d4312bd830e396ec6f6b62a0516f725 --- drivers/video/fbdev/msm/msm_mdss_io_8974.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/video/fbdev/msm/msm_mdss_io_8974.c b/drivers/video/fbdev/msm/msm_mdss_io_8974.c index 922c4440ba82..000beebe0375 100644 --- a/drivers/video/fbdev/msm/msm_mdss_io_8974.c +++ b/drivers/video/fbdev/msm/msm_mdss_io_8974.c @@ -1321,16 +1321,16 @@ static void mdss_dsi_phy_regulator_ctrl(struct mdss_dsi_ctrl_pdata *ctrl, mdss_dsi_20nm_phy_regulator_enable(ctrl); break; default: - /* - * For dual dsi case, do not reconfigure dsi phy - * regulator if the other dsi controller is still - * active. - */ - if (!mdss_dsi_is_hw_config_dual(sdata) || - (other_ctrl && (!other_ctrl->is_phyreg_enabled - || other_ctrl->mmss_clamp))) - mdss_dsi_28nm_phy_regulator_enable(ctrl); - break; + /* + * For dual dsi case, do not reconfigure dsi phy + * regulator if the other dsi controller is still + * active. + */ + if (!mdss_dsi_is_hw_config_dual(sdata) || + (other_ctrl && (!other_ctrl->is_phyreg_enabled + || other_ctrl->mmss_clamp))) + mdss_dsi_28nm_phy_regulator_enable(ctrl); + break; } } ctrl->is_phyreg_enabled = 1; -- cgit v1.2.3 From cbe0b37059c9c491c48b0d014f2ceb0f2e9e2e96 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 7 Jun 2018 17:05:28 -0700 Subject: mm: introduce arg_lock to protect arg_start|end and env_start|end in mm_struct mmap_sem is on the hot path of kernel, and it very contended, but it is abused too. It is used to protect arg_start|end and evn_start|end when reading /proc/$PID/cmdline and /proc/$PID/environ, but it doesn't make sense since those proc files just expect to read 4 values atomically and not related to VM, they could be set to arbitrary values by C/R. And, the mmap_sem contention may cause unexpected issue like below: INFO: task ps:14018 blocked for more than 120 seconds. Tainted: G E 4.9.79-009.ali3000.alios7.x86_64 #1 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. ps D 0 14018 1 0x00000004 Call Trace: schedule+0x36/0x80 rwsem_down_read_failed+0xf0/0x150 call_rwsem_down_read_failed+0x18/0x30 down_read+0x20/0x40 proc_pid_cmdline_read+0xd9/0x4e0 __vfs_read+0x37/0x150 vfs_read+0x96/0x130 SyS_read+0x55/0xc0 entry_SYSCALL_64_fastpath+0x1a/0xc5 Both Alexey Dobriyan and Michal Hocko suggested to use dedicated lock for them to mitigate the abuse of mmap_sem. So, introduce a new spinlock in mm_struct to protect the concurrent access to arg_start|end, env_start|end and others, as well as replace write map_sem to read to protect the race condition between prctl and sys_brk which might break check_data_rlimit(), and makes prctl more friendly to other VM operations. This patch just eliminates the abuse of mmap_sem, but it can't resolve the above hung task warning completely since the later access_remote_vm() call needs acquire mmap_sem. The mmap_sem scalability issue will be solved in the future. 
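A minimal userspace sketch of the locking pattern being introduced: a wide, contended lock for the address space plus a narrow dedicated lock for the four boundary fields, so readers of just those values never wait on the big lock (names illustrative, not kernel code):

#include <pthread.h>
#include <stdio.h>

struct mm_like {
	pthread_rwlock_t mmap_sem;	/* big, contended lock */
	pthread_mutex_t arg_lock;	/* tiny lock for the fields below only */
	unsigned long arg_start, arg_end, env_start, env_end;
};

static struct mm_like mm = {
	.mmap_sem = PTHREAD_RWLOCK_INITIALIZER,
	.arg_lock = PTHREAD_MUTEX_INITIALIZER,
	.arg_start = 0x1000, .arg_end = 0x1100,
	.env_start = 0x1100, .env_end = 0x1200,
};

/* what a cmdline/environ reader needs: a consistent snapshot of four
 * values, taken without touching the big mmap lock */
static void snapshot_args(unsigned long out[4])
{
	pthread_mutex_lock(&mm.arg_lock);
	out[0] = mm.arg_start;
	out[1] = mm.arg_end;
	out[2] = mm.env_start;
	out[3] = mm.env_end;
	pthread_mutex_unlock(&mm.arg_lock);
}

int main(void)
{
	unsigned long v[4];

	snapshot_args(v);
	printf("args: %#lx-%#lx env: %#lx-%#lx\n", v[0], v[1], v[2], v[3]);
	return 0;
}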
Change-Id: Ifa8f001ee2fc4f0ce60c18e771cebcf8a1f0943e [yang.shi@linux.alibaba.com: add comment about mmap_sem and arg_lock] Link: http://lkml.kernel.org/r/1524077799-80690-1-git-send-email-yang.shi@linux.alibaba.com Link: http://lkml.kernel.org/r/1523730291-109696-1-git-send-email-yang.shi@linux.alibaba.com Signed-off-by: Yang Shi Reviewed-by: Cyrill Gorcunov Acked-by: Michal Hocko Cc: Alexey Dobriyan Cc: Matthew Wilcox Cc: Mateusz Guzik Cc: Kirill Tkhai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Git-commit: 88aa7cc688d48ddd84558b41d5905a0db9535c4b Git-repo: git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git Signed-off-by: Srinivas Ramana --- fs/proc/base.c | 8 ++++---- include/linux/mm_types.h | 2 ++ kernel/fork.c | 1 + kernel/sys.c | 10 ++++++++-- mm/init-mm.c | 1 + 5 files changed, 16 insertions(+), 6 deletions(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index e16afc80f810..c7402cb76f11 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -233,12 +233,12 @@ static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, goto out_mmput; } - down_read(&mm->mmap_sem); + spin_lock(&mm->arg_lock); arg_start = mm->arg_start; arg_end = mm->arg_end; env_start = mm->env_start; env_end = mm->env_end; - up_read(&mm->mmap_sem); + spin_unlock(&mm->arg_lock); BUG_ON(arg_start > arg_end); BUG_ON(env_start > env_end); @@ -990,10 +990,10 @@ static ssize_t environ_read(struct file *file, char __user *buf, if (!atomic_inc_not_zero(&mm->mm_users)) goto free; - down_read(&mm->mmap_sem); + spin_lock(&mm->arg_lock); env_start = mm->env_start; env_end = mm->env_end; - up_read(&mm->mmap_sem); + spin_unlock(&mm->arg_lock); while (count > 0) { size_t this_len, max_len; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 29c17fae9bbf..1019e8d3c88f 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -442,6 +442,8 @@ struct mm_struct { unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE */ unsigned long stack_vm; /* VM_GROWSUP/DOWN */ unsigned long def_flags; + + spinlock_t arg_lock; /* protect the below fields */ unsigned long start_code, end_code, start_data, end_data; unsigned long start_brk, brk, start_stack; unsigned long arg_start, arg_end, env_start, env_end; diff --git a/kernel/fork.c b/kernel/fork.c index a21adc0155b9..c25ebf6dd7f9 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -621,6 +621,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm->pinned_vm = 0; memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); spin_lock_init(&mm->page_table_lock); + spin_lock_init(&mm->arg_lock); mm_init_cpumask(mm); mm_init_aio(mm); mm_init_owner(mm, p); diff --git a/kernel/sys.c b/kernel/sys.c index d5ea3360038c..25cf2aa72d3b 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1854,7 +1854,11 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data return error; } - down_write(&mm->mmap_sem); + /* + * arg_lock protects concurent updates but we still need mmap_sem for + * read to exclude races with sys_brk. 
+ */ + down_read(&mm->mmap_sem); /* * We don't validate if these members are pointing to @@ -1868,6 +1872,7 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data * to any problem in kernel itself */ + spin_lock(&mm->arg_lock); mm->start_code = prctl_map.start_code; mm->end_code = prctl_map.end_code; mm->start_data = prctl_map.start_data; @@ -1879,6 +1884,7 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data mm->arg_end = prctl_map.arg_end; mm->env_start = prctl_map.env_start; mm->env_end = prctl_map.env_end; + spin_unlock(&mm->arg_lock); /* * Note this update of @saved_auxv is lockless thus @@ -1891,7 +1897,7 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data if (prctl_map.auxv_size) memcpy(mm->saved_auxv, user_auxv, sizeof(user_auxv)); - up_write(&mm->mmap_sem); + up_read(&mm->mmap_sem); return 0; } #endif /* CONFIG_CHECKPOINT_RESTORE */ diff --git a/mm/init-mm.c b/mm/init-mm.c index 975e49f00f34..02a6962d5b0b 100644 --- a/mm/init-mm.c +++ b/mm/init-mm.c @@ -21,6 +21,7 @@ struct mm_struct init_mm = { .mm_count = ATOMIC_INIT(1), .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem), .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), + .arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock), .mmlist = LIST_HEAD_INIT(init_mm.mmlist), .user_ns = &init_user_ns, INIT_MM_CONTEXT(init_mm) -- cgit v1.2.3 From ebf270d24640abda4ddc8061e615facdb9b074d0 Mon Sep 17 00:00:00 2001 From: John Dias Date: Mon, 6 Aug 2018 14:08:03 -0700 Subject: sched/fair: vruntime should normalize when switching from fair When rt_mutex_setprio changes a task's scheduling class to RT, we're seeing cases where the task's vruntime is not updated correctly upon return to the fair class. Specifically, the following is being observed: - task is deactivated while still in the fair class - task is boosted to RT via rt_mutex_setprio, which changes the task to RT and calls check_class_changed. - check_class_changed leads to detach_task_cfs_rq, at which point the vruntime_normalized check sees that the task's state is TASK_WAKING, which results in skipping the subtraction of the rq's min_vruntime from the task's vruntime - later, when the prio is deboosted and the task is moved back to the fair class, the fair rq's min_vruntime is added to the task's vruntime, even though it wasn't subtracted earlier. The immediate result is inflation of the task's vruntime, giving it lower priority (starving it if there's enough available work). The longer-term effect is inflation of all vruntimes because the task's vruntime becomes the rq's min_vruntime when the higher priority tasks go idle. That leads to a vicious cycle, where the vruntime inflation repeatedly doubled. The change here is to detect when vruntime_normalized is being called when the task is waking but is waking in another class, and to conclude that this is a case where vruntime has not been normalized. Bug: 80502612 Change-Id: If0bb02eb16939ca5e91ef282b7f9119ff68622c4 Signed-off-by: John Dias --- kernel/sched/fair.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 51443a801af5..3c0a8050b77d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -11772,7 +11772,8 @@ static inline bool vruntime_normalized(struct task_struct *p) * - A task which has been woken up by try_to_wake_up() and * waiting for actually being woken up by sched_ttwu_pending(). 
*/ - if (!se->sum_exec_runtime || p->state == TASK_WAKING) + if (!se->sum_exec_runtime || + (p->state == TASK_WAKING && p->sched_class == &fair_sched_class)) return true; return false; -- cgit v1.2.3 From 90dccbae4c0410e209049e22eaae9cd2718e1aa5 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 5 Mar 2019 15:43:06 -0800 Subject: UPSTREAM: mm: reuse only-pte-mapped KSM page in do_wp_page() Add an optimization for KSM pages almost in the same way that we have for ordinary anonymous pages. If there is a write fault in a page, which is mapped to an only pte, and it is not related to swap cache; the page may be reused without copying its content. [ Note that we do not consider PageSwapCache() pages at least for now, since we don't want to complicate __get_ksm_page(), which has nice optimization based on this (for the migration case). Currenly it is spinning on PageSwapCache() pages, waiting for when they have unfreezed counters (i.e., for the migration finish). But we don't want to make it also spinning on swap cache pages, which we try to reuse, since there is not a very high probability to reuse them. So, for now we do not consider PageSwapCache() pages at all. ] So in reuse_ksm_page() we check for 1) PageSwapCache() and 2) page_stable_node(), to skip a page, which KSM is currently trying to link to stable tree. Then we do page_ref_freeze() to prohibit KSM to merge one more page into the page, we are reusing. After that, nobody can refer to the reusing page: KSM skips !PageSwapCache() pages with zero refcount; and the protection against of all other participants is the same as for reused ordinary anon pages pte lock, page lock and mmap_sem. [akpm@linux-foundation.org: replace BUG_ON()s with WARN_ON()s] Link: http://lkml.kernel.org/r/154471491016.31352.1168978849911555609.stgit@localhost.localdomain Signed-off-by: Kirill Tkhai Reviewed-by: Yang Shi Cc: "Kirill A. Shutemov" Cc: Hugh Dickins Cc: Andrea Arcangeli Cc: Christian Koenig Cc: Claudio Imbrenda Cc: Rik van Riel Cc: Huang Ying Cc: Minchan Kim Cc: Kirill Tkhai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Change-Id: If32387b1f7c36f0e12fcbb0926bf1b67886ec594 --- include/linux/ksm.h | 7 +++++++ mm/ksm.c | 30 ++++++++++++++++++++++++++++-- mm/memory.c | 17 +++++++++++++++-- 3 files changed, 50 insertions(+), 4 deletions(-) diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 481c8c4627ca..febba394f93c 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -62,6 +62,8 @@ struct page *ksm_might_need_to_copy(struct page *page, int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc); void ksm_migrate_page(struct page *newpage, struct page *oldpage); +bool reuse_ksm_page(struct page *page, + struct vm_area_struct *vma, unsigned long address); #else /* !CONFIG_KSM */ @@ -102,6 +104,11 @@ static inline int rmap_walk_ksm(struct page *page, static inline void ksm_migrate_page(struct page *newpage, struct page *oldpage) { } +static inline bool reuse_ksm_page(struct page *page, + struct vm_area_struct *vma, unsigned long address) +{ + return false; +} #endif /* CONFIG_MMU */ #endif /* !CONFIG_KSM */ diff --git a/mm/ksm.c b/mm/ksm.c index 2a4ef426b331..bfee36c149e6 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -580,8 +580,9 @@ again: * case this node is no longer referenced, and should be freed; * however, it might mean that the page is under page_freeze_refs(). 
* The __remove_mapping() case is easy, again the node is now stale; - * but if page is swapcache in migrate_page_move_mapping(), it might - * still be our page, in which case it's essential to keep the node. + * the same is in reuse_ksm_page() case; but if page is swapcache + * in migrate_page_move_mapping(), it might still be our page, + * in which case it's essential to keep the node. */ while (!get_page_unless_zero(page)) { /* @@ -2061,6 +2062,31 @@ out: return ret; } +bool reuse_ksm_page(struct page *page, + struct vm_area_struct *vma, + unsigned long address) +{ +#ifdef CONFIG_DEBUG_VM + if (WARN_ON(is_zero_pfn(page_to_pfn(page))) || + WARN_ON(!page_mapped(page)) || + WARN_ON(!PageLocked(page))) { + dump_page(page, "reuse_ksm_page"); + return false; + } +#endif + + if (PageSwapCache(page) || !page_stable_node(page)) + return false; + /* Prohibit parallel get_ksm_page() */ + if (!page_ref_freeze(page, 1)) + return false; + + page_move_anon_rmap(page, vma); + page->index = linear_page_index(vma, address); + page_ref_unfreeze(page, 1); + + return true; +} #ifdef CONFIG_MIGRATION void ksm_migrate_page(struct page *newpage, struct page *oldpage) { diff --git a/mm/memory.c b/mm/memory.c index 09a57fe6ae01..ccb04d3f9bab 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2399,7 +2399,10 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, * Take out anonymous pages first, anonymous shared vmas are * not dirty accountable. */ - if (PageAnon(old_page) && !PageKsm(old_page)) { + if (PageAnon(old_page)) { + if (PageKsm(old_page) && (PageSwapCache(old_page) || + page_count(old_page) != 1)) + goto copy; if (!trylock_page(old_page)) { page_cache_get(old_page); pte_unmap_unlock(page_table, ptl); @@ -2414,6 +2417,16 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, } page_cache_release(old_page); } + if (PageKsm(old_page)) { + bool reused = reuse_ksm_page(old_page, vma, + address); + unlock_page(old_page); + if (!reused) + goto copy; + wp_page_reuse(mm, vma, address, page_table, ptl, + orig_pte, old_page, 0, 0); + return VM_FAULT_WRITE; + } if (reuse_swap_page(old_page)) { /* * The page is all ours. Move it to our anon_vma so @@ -2431,7 +2444,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, return wp_page_shared(mm, vma, address, page_table, pmd, ptl, orig_pte, old_page); } - +copy: /* * Ok, we need to copy. Oh, well.. */ -- cgit v1.2.3 From 46c6fbdd185a7005fac895c5cfa7896b9329324f Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 21 Aug 2020 19:49:55 -0400 Subject: BACKPORT: mm: do_wp_page() simplification How about we just make sure we're the only possible valid user fo the page before we bother to reuse it? Simplify, simplify, simplify. And get rid of the nasty serialization on the page lock at the same time. [peterx: add subject prefix] Signed-off-by: Linus Torvalds Signed-off-by: Peter Xu Signed-off-by: Linus Torvalds Change-Id: I25c2cf9a3afebc19aaeda29ffbfcb82edb7fcb7c --- mm/memory.c | 49 ++++++++++++++----------------------------------- 1 file changed, 14 insertions(+), 35 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index ccb04d3f9bab..8e44f5d2d5f4 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2400,45 +2400,24 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, * not dirty accountable. 
*/ if (PageAnon(old_page)) { - if (PageKsm(old_page) && (PageSwapCache(old_page) || - page_count(old_page) != 1)) + /* PageKsm() doesn't necessarily raise the page refcount */ + if (PageKsm(old_page) || page_count(old_page) != 1) goto copy; - if (!trylock_page(old_page)) { - page_cache_get(old_page); - pte_unmap_unlock(page_table, ptl); - lock_page(old_page); - page_table = pte_offset_map_lock(mm, pmd, address, - &ptl); - if (!pte_same(*page_table, orig_pte)) { - unlock_page(old_page); - pte_unmap_unlock(page_table, ptl); - page_cache_release(old_page); - return 0; - } - page_cache_release(old_page); - } - if (PageKsm(old_page)) { - bool reused = reuse_ksm_page(old_page, vma, - address); - unlock_page(old_page); - if (!reused) - goto copy; - wp_page_reuse(mm, vma, address, page_table, ptl, - orig_pte, old_page, 0, 0); - return VM_FAULT_WRITE; - } - if (reuse_swap_page(old_page)) { - /* - * The page is all ours. Move it to our anon_vma so - * the rmap code will not search our parent or siblings. - * Protected against the rmap code by the page lock. - */ - page_move_anon_rmap(old_page, vma, address); + if (!trylock_page(old_page)) + goto copy; + if (PageKsm(old_page) || page_mapcount(old_page) != 1 || page_count(old_page) != 1) { unlock_page(old_page); - return wp_page_reuse(mm, vma, address, page_table, ptl, - orig_pte, old_page, 0, 0); + goto copy; } + /* + * Ok, we've got the only map reference, and the only + * page count reference, and the page is locked, + * it's dark out, and we're wearing sunglasses. Hit it. + */ + wp_page_reuse(mm, vma, address, page_table, ptl, + orig_pte, old_page, 0, 0); unlock_page(old_page); + return VM_FAULT_WRITE; } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == (VM_WRITE|VM_SHARED))) { return wp_page_shared(mm, vma, address, page_table, pmd, -- cgit v1.2.3 From 899def5edcd48ff16c0f41624ecf8be77c81a18d Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 21 Aug 2020 19:49:56 -0400 Subject: UPSTREAM: mm/ksm: Remove reuse_ksm_page() Remove the function as the last reference has gone away with the do_wp_page() changes. 
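Note: taken together, the simplification above and this removal reduce the anonymous-page reuse decision in do_wp_page() to a single predicate: reuse in place only when we demonstrably hold the sole reference, otherwise copy. The standalone C sketch below is only a model of that decision; the struct fields and helper names are invented stand-ins for the page flags and refcounts the real code inspects, not kernel APIs.

    #include <stdbool.h>
    #include <stdio.h>

    struct page_model {
            bool is_ksm;    /* models PageKsm()        */
            bool locked;    /* models trylock_page()   */
            int refcount;   /* models page_count()     */
            int mapcount;   /* models page_mapcount()  */
    };

    /* True when the fault handler may reuse the page in place;
     * false means fall through to the copy path. */
    static bool may_reuse(const struct page_model *p)
    {
            if (p->is_ksm || p->refcount != 1)
                    return false;   /* someone else may hold a reference */
            if (!p->locked)
                    return false;   /* lost the race for the page lock */
            /* Re-check under the lock: still the only map and reference. */
            return p->mapcount == 1 && p->refcount == 1;
    }

    int main(void)
    {
            struct page_model sole = { .is_ksm = false, .locked = true,
                                       .refcount = 1, .mapcount = 1 };
            struct page_model shared = { .is_ksm = false, .locked = true,
                                         .refcount = 2, .mapcount = 2 };

            printf("sole owner: %s\n", may_reuse(&sole) ? "reuse" : "copy");
            printf("shared:     %s\n", may_reuse(&shared) ? "reuse" : "copy");
            return 0;
    }

Any doubt (an extra reference, a lost lock race, a KSM page) falls through to the copy path, which is always safe; that is the design choice the simplification makes.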
Signed-off-by: Peter Xu Signed-off-by: Linus Torvalds Change-Id: Ie4da88791abe9407157566854b2db9b94c0c962f --- include/linux/ksm.h | 7 ------- mm/ksm.c | 25 ------------------------- 2 files changed, 32 deletions(-) diff --git a/include/linux/ksm.h b/include/linux/ksm.h index febba394f93c..481c8c4627ca 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -62,8 +62,6 @@ struct page *ksm_might_need_to_copy(struct page *page, int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc); void ksm_migrate_page(struct page *newpage, struct page *oldpage); -bool reuse_ksm_page(struct page *page, - struct vm_area_struct *vma, unsigned long address); #else /* !CONFIG_KSM */ @@ -104,11 +102,6 @@ static inline int rmap_walk_ksm(struct page *page, static inline void ksm_migrate_page(struct page *newpage, struct page *oldpage) { } -static inline bool reuse_ksm_page(struct page *page, - struct vm_area_struct *vma, unsigned long address) -{ - return false; -} #endif /* CONFIG_MMU */ #endif /* !CONFIG_KSM */ diff --git a/mm/ksm.c b/mm/ksm.c index bfee36c149e6..98920d1d59ed 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -2062,31 +2062,6 @@ out: return ret; } -bool reuse_ksm_page(struct page *page, - struct vm_area_struct *vma, - unsigned long address) -{ -#ifdef CONFIG_DEBUG_VM - if (WARN_ON(is_zero_pfn(page_to_pfn(page))) || - WARN_ON(!page_mapped(page)) || - WARN_ON(!PageLocked(page))) { - dump_page(page, "reuse_ksm_page"); - return false; - } -#endif - - if (PageSwapCache(page) || !page_stable_node(page)) - return false; - /* Prohibit parallel get_ksm_page() */ - if (!page_ref_freeze(page, 1)) - return false; - - page_move_anon_rmap(page, vma); - page->index = linear_page_index(vma, address); - page_ref_unfreeze(page, 1); - - return true; -} #ifdef CONFIG_MIGRATION void ksm_migrate_page(struct page *newpage, struct page *oldpage) { -- cgit v1.2.3 From 45df1516d04a155794e56e6ded1c4813a3e56048 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 24 Sep 2020 08:41:32 -0700 Subject: UPSTREAM: mm: fix misplaced unlock_page in do_wp_page() Commit 09854ba94c6a ("mm: do_wp_page() simplification") reorganized all the code around the page re-use vs copy, but in the process also moved the final unlock_page() around to after the wp_page_reuse() call. That normally doesn't matter - but it means that the unlock_page() is now done after releasing the page table lock. Again, not a big deal, you'd think. But it turns out that it's very wrong indeed, because once we've released the page table lock, we've basically lost our only reference to the page - the page tables - and it could now be free'd at any time. We do hold the mmap_sem, so no actual unmap() can happen, but madvise can come in and a MADV_DONTNEED will zap the page range - and free the page. So now the page may be free'd just as we're unlocking it, which in turn will usually trigger a "Bad page state" error in the freeing path. To make matters more confusing, by the time the debug code prints out the page state, the unlock has typically completed and everything looks fine again. This all doesn't happen in any normal situations, but it does trigger with the dirtyc0w_child LTP test. And it seems to trigger much more easily (but not expclusively) on s390 than elsewhere, probably because s390 doesn't do the "batch pages up for freeing after the TLB flush" that gives the unlock_page() more time to complete and makes the race harder to hit. 
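Note: the hazard described above is the generic "last access after the last pin" pattern. The userspace sketch below models only the corrected ordering; the names are illustrative, and free() stands in for the MADV_DONTNEED zap described above, not for any kernel call.

    #include <stdio.h>
    #include <stdlib.h>

    struct page_model { int data; };

    /* obj stays valid only while we hold the pin; this models the
     * page-table lock being the only thing keeping the page alive. */
    static struct page_model *obj;

    static void drop_pin(void)
    {
            /* Once the pin is gone another path may free the object at
             * any time; model that by freeing it immediately. */
            free(obj);
            obj = NULL;
    }

    int main(void)
    {
            obj = malloc(sizeof(*obj));
            if (!obj)
                    return 1;
            obj->data = 1;

            /* Corrected ordering: the final touch of the object (the
             * unlock_page() in the patch) happens before whatever drops
             * the pin (the wp_page_reuse() call, which releases the
             * page-table lock in this tree). */
            printf("last access: %d\n", obj->data);
            drop_pin();

            /* Accessing obj here would be the use-after-free the patch removes. */
            return 0;
    }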
Fixes: 09854ba94c6a ("mm: do_wp_page() simplification") Link: https://lore.kernel.org/lkml/a46e9bbef2ed4e17778f5615e818526ef848d791.camel@redhat.com/ Link: https://lore.kernel.org/linux-mm/c41149a8-211e-390b-af1d-d5eee690fecb@linux.alibaba.com/ Reported-by: Qian Cai Reported-by: Alex Shi Bisected-and-analyzed-by: Gerald Schaefer Tested-by: Gerald Schaefer Signed-off-by: Linus Torvalds Change-Id: Idae0eb8d7478ab61de567ca9e446df4cc4dfc2a0 --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index 8e44f5d2d5f4..0c69908d3eed 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2414,9 +2414,9 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, * page count reference, and the page is locked, * it's dark out, and we're wearing sunglasses. Hit it. */ + unlock_page(old_page); wp_page_reuse(mm, vma, address, page_table, ptl, orig_pte, old_page, 0, 0); - unlock_page(old_page); return VM_FAULT_WRITE; } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == (VM_WRITE|VM_SHARED))) { -- cgit v1.2.3 From c0b317c27d445025c40d2f3af1a052115e027e5e Mon Sep 17 00:00:00 2001 From: Tengfei Fan Date: Mon, 19 Nov 2018 13:45:29 +0800 Subject: pinctrl: qcom: Clear status bit on irq_unmask The gpio interrupt status bit is getting set after the irq is disabled and causing an immediate interrupt after enablling the irq, so clear status bit on irq_unmask. Change-Id: I89245b90b06b37671369e59c15fb24a991cc114a Signed-off-by: Tengfei Fan --- drivers/pinctrl/qcom/pinctrl-msm.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/pinctrl/qcom/pinctrl-msm.c b/drivers/pinctrl/qcom/pinctrl-msm.c index d4a1f5378ac5..ee8c09717597 100644 --- a/drivers/pinctrl/qcom/pinctrl-msm.c +++ b/drivers/pinctrl/qcom/pinctrl-msm.c @@ -631,6 +631,7 @@ static void msm_gpio_irq_enable(struct irq_data *d) static void msm_gpio_irq_unmask(struct irq_data *d) { struct gpio_chip *gc = irq_data_get_irq_chip_data(d); + uint32_t irqtype = irqd_get_trigger_type(d); struct msm_pinctrl *pctrl = to_msm_pinctrl(gc); const struct msm_pingroup *g; unsigned long flags; @@ -640,6 +641,12 @@ static void msm_gpio_irq_unmask(struct irq_data *d) spin_lock_irqsave(&pctrl->lock, flags); + if (irqtype & (IRQF_TRIGGER_HIGH | IRQF_TRIGGER_LOW)) { + val = readl_relaxed(pctrl->regs + g->intr_status_reg); + val &= ~BIT(g->intr_status_bit); + writel_relaxed(val, pctrl->regs + g->intr_status_reg); + } + val = readl(pctrl->regs + g->intr_status_reg); val &= ~BIT(g->intr_status_bit); writel(val, pctrl->regs + g->intr_status_reg); -- cgit v1.2.3 From a9314f9d8ad402f17e107f2f4a11636e50301cfa Mon Sep 17 00:00:00 2001 From: Maria Yu Date: Mon, 15 Apr 2019 12:41:12 +0800 Subject: sched/fair: Allow load bigger task load balance when nr_running is 2 When there is only 2 tasks in 1 cpu and the other task is currently running, allow load bigger task to be balanced if the other task is currently running. 
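Note: the diff below is easier to read as a predicate: the "task load is more than twice the imbalance" cut-off should only reject a task when the source CPU still has other candidates, or when big tasks are being ignored anyway; with exactly two tasks and one of them running, the remaining task is the only thing that can move. A standalone sketch of that decision, with invented names standing in for the lb_env fields:

    #include <stdbool.h>
    #include <stdio.h>

    /* Userspace model of the filter added in detach_tasks();
     * names and types are illustrative only. */
    static bool skip_as_too_big(unsigned long load, unsigned long imbalance,
                                unsigned int src_nr_running,
                                bool ignore_big_tasks)
    {
            /* Apply the "load > 2 * imbalance" cut-off only when the
             * source CPU still has other tasks that could be balanced
             * instead, or when big tasks are deliberately ignored. */
            if (src_nr_running > 2 || ignore_big_tasks)
                    return (load / 2) > imbalance;
            return false;
    }

    int main(void)
    {
            /* Two tasks on the source CPU, one of them running: the other
             * task is the only candidate, so it is not skipped even if big. */
            printf("nr=2, big task: skip=%d\n",
                   skip_as_too_big(800, 100, 2, false));
            /* Three tasks: the big one can be skipped in favour of the rest. */
            printf("nr=3, big task: skip=%d\n",
                   skip_as_too_big(800, 100, 3, false));
            return 0;
    }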
Change-Id: I489e9624ba010f9293272a67585e8209a786b787 Signed-off-by: Maria Yu --- kernel/sched/fair.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3c0a8050b77d..941424604fdd 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8953,7 +8953,17 @@ redo: if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed) goto next; - if ((load / 2) > env->imbalance) + /* + * p is not running task when we goes until here, so if p is one + * of the 2 task in src cpu rq and not the running one, + * that means it is the only task that can be balanced. + * So only when there is other tasks can be balanced or + * there is situation to ignore big task, it is needed + * to skip the task load bigger than 2*imbalance. + */ + if (((cpu_rq(env->src_cpu)->nr_running > 2) || + (env->flags & LBF_IGNORE_BIG_TASKS)) && + ((load / 2) > env->imbalance)) goto next; detach_task(p, env); -- cgit v1.2.3 From b9b6bc6ea3c06ab2edac96db7a3f9e51c9e459d1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 17 Jun 2017 08:10:08 -0400 Subject: sched: Allow migrating kthreads into online but inactive CPUs Per-cpu workqueues have been tripping CPU affinity sanity checks while a CPU is being offlined. A per-cpu kworker ends up running on a CPU which isn't its target CPU while the CPU is online but inactive. While the scheduler allows kthreads to wake up on an online but inactive CPU, it doesn't allow a running kthread to be migrated to such a CPU, which leads to an odd situation where setting affinity on a sleeping and running kthread leads to different results. Each mem-reclaim workqueue has one rescuer which guarantees forward progress and the rescuer needs to bind itself to the CPU which needs help in making forward progress; however, due to the above issue, while set_cpus_allowed_ptr() succeeds, the rescuer doesn't end up on the correct CPU if the CPU is in the process of going offline, tripping the sanity check and executing the work item on the wrong CPU. This patch updates __migrate_task() so that kthreads can be migrated into an inactive but online CPU. Change-Id: I38cc3eb3b2ec5b7034cc72a2bcdd32a549314915 Signed-off-by: Tejun Heo Reported-by: "Paul E. McKenney" Reported-by: Steven Rostedt --- kernel/sched/core.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b33433586774..3c64cd08e8e2 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1144,8 +1144,13 @@ static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_ { int src_cpu; - if (unlikely(!cpu_active(dest_cpu))) - return rq; + if (p->flags & PF_KTHREAD) { + if (unlikely(!cpu_online(dest_cpu))) + return ret; + } else { + if (unlikely(!cpu_active(dest_cpu))) + return ret; + } /* Affinity changed (again). */ if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) -- cgit v1.2.3 From c9999f04236e640bc82b2085cadca8487184bc76 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 16 Jun 2016 15:35:04 -0400 Subject: sched/core: Allow kthreads to fall back to online && !active cpus During CPU hotplug, CPU_ONLINE callbacks are run while the CPU is online but not active. A CPU_ONLINE callback may create or bind a kthread so that its cpus_allowed mask only allows the CPU which is being brought online. The kthread may start executing before the CPU is made active and can end up in select_fallback_rq(). 
In such cases, the expected behavior is selecting the CPU which is coming online; however, because select_fallback_rq() only chooses from active CPUs, it determines that the task doesn't have any viable CPU in its allowed mask and ends up overriding it to cpu_possible_mask. CPU_ONLINE callbacks should be able to put kthreads on the CPU which is coming online. Update select_fallback_rq() so that it follows cpu_online() rather than cpu_active() for kthreads. Reported-by: Gautham R Shenoy Tested-by: Gautham R. Shenoy Change-Id: I562dcc53717b1f2f8324abffb652b91592ba8d5c Signed-off-by: Tejun Heo Signed-off-by: Peter Zijlstra (Intel) Cc: Abdul Haleem Cc: Aneesh Kumar Cc: Linus Torvalds Cc: Michael Ellerman Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: kernel-team@fb.com Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/20160616193504.GB3262@mtj.duckdns.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3c64cd08e8e2..e83959592d6c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1654,9 +1654,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p, bool allow_iso) for (;;) { /* Any allowed, online CPU? */ for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) { - if (!cpu_online(dest_cpu)) + if (!(p->flags & PF_KTHREAD) && !cpu_active(dest_cpu)) continue; - if (!cpu_active(dest_cpu)) + if (!cpu_online(dest_cpu)) continue; if (cpu_isolated(dest_cpu)) { if (allow_iso) -- cgit v1.2.3 From 0ffdb886996b66e1eb9bbda6e9605bf7dbca0cd1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 25 Jul 2017 18:58:21 +0200 Subject: BACKPORT: sched/core: Fix rules for running on online && !active CPUs As already enforced by the WARN() in __set_cpus_allowed_ptr(), the rules for running on an online && !active CPU are stricter than just being a kthread, you need to be a per-cpu kthread. If you're not strictly per-CPU, you have better CPUs to run on and don't need the partially booted one to get your work done. The exception is to allow smpboot threads to bootstrap the CPU itself and get kernel 'services' initialized before we allow userspace on it. Change-Id: I515e873a6e5be0cde7771ecedf56101614300fe2 Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Tejun Heo Cc: Thomas Gleixner Fixes: 955dbdf4ce87 ("sched: Allow migrating kthreads into online but inactive CPUs") Link: http://lkml.kernel.org/r/20170725165821.cejhb7v2s3kecems@hirez.programming.kicks-ass.net Signed-off-by: Ingo Molnar Backported to 4.4 Signed-off-by: joshuous --- kernel/sched/core.c | 41 +++++++++++++++++++++++++++++------------ 1 file changed, 29 insertions(+), 12 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e83959592d6c..f73831486c2f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1085,6 +1085,33 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) } #ifdef CONFIG_SMP + +static inline bool is_per_cpu_kthread(struct task_struct *p) +{ + if (!(p->flags & PF_KTHREAD)) + return false; + + if (p->nr_cpus_allowed != 1) + return false; + + return true; +} + +/* + * Per-CPU kthreads are allowed to run on !actie && online CPUs, see + * __set_cpus_allowed_ptr() and select_fallback_rq(). 
+ */ +static inline bool is_cpu_allowed(struct task_struct *p, int cpu) +{ + if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) + return false; + + if (is_per_cpu_kthread(p)) + return cpu_online(cpu); + + return cpu_active(cpu); +} + /* * This is how migration works: * @@ -1144,16 +1171,8 @@ static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_ { int src_cpu; - if (p->flags & PF_KTHREAD) { - if (unlikely(!cpu_online(dest_cpu))) - return ret; - } else { - if (unlikely(!cpu_active(dest_cpu))) - return ret; - } - /* Affinity changed (again). */ - if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) + if (!is_cpu_allowed(p, dest_cpu)) return rq; src_cpu = cpu_of(rq); @@ -1654,9 +1673,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p, bool allow_iso) for (;;) { /* Any allowed, online CPU? */ for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) { - if (!(p->flags & PF_KTHREAD) && !cpu_active(dest_cpu)) - continue; - if (!cpu_online(dest_cpu)) + if (!is_cpu_allowed(p, dest_cpu)) continue; if (cpu_isolated(dest_cpu)) { if (allow_iso) -- cgit v1.2.3 From eccc8acbe705a20e0911ea776371d84eba53cc8e Mon Sep 17 00:00:00 2001 From: Maria Yu Date: Fri, 26 Apr 2019 15:20:18 +0800 Subject: sched/fair: Avoid unnecessary active load balance When find busiest group, it will avoid load balance if it is only 1 task running on src cpu. Consider race when different cpus do newly idle load balance at the same time, check src cpu nr_running to avoid unnecessary active load balance again. See the race condition example here: 1) cpu2 have 2 tasks, so cpu2 rq->nr_running == 2 and cfs.h_nr_running ==2. 2) cpu4 and cpu5 doing newly idle load balance at the same time. 3) cpu4 and cpu5 both see cpu2 sched_load_balance_sg_stats sum_nr_run=2 so they are both see cpu2 as the busiest rq. 4) cpu5 did a success migration task from cpu2, so cpu2 only have 1 task left, cpu2 rq->nr_running == 1 and cfs.h_nr_running ==1. 5) cpu4 surely goes to no_move because currently cpu4 only have 1 task which is currently running. 6) and then cpu4 goes here to check if cpu2 need active load balance. Change-Id: Ia9539a43e9769c4936f06ecfcc11864984c50c29 Signed-off-by: Maria Yu --- kernel/sched/fair.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 941424604fdd..42f05c742846 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -10456,8 +10456,10 @@ static int need_active_balance(struct lb_env *env) * It's worth migrating the task if the src_cpu's capacity is reduced * because of other sched_class or IRQs if more capacity stays * available on dst_cpu. + * Avoid pulling the CFS task if it is the only task running. */ if ((env->idle != CPU_NOT_IDLE) && + (env->src_rq->nr_running > 1) && (env->src_rq->cfs.h_nr_running == 1)) { if ((check_cpu_capacity(env->src_rq, sd)) && (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100)) -- cgit v1.2.3 From 8d8a48aecde5c4be6c57b9108dc22e8e0cd7f235 Mon Sep 17 00:00:00 2001 From: Jeffrey Hugo Date: Fri, 19 May 2017 23:49:11 -0400 Subject: sched/fair: Fix load_balance() affinity redo path If load_balance() fails to migrate any tasks because all tasks were affined, load_balance() removes the source cpu from consideration and attempts to redo and balance among the new subset of cpus. There is a bug in this code path where the algorithm considers all active cpus in the system (minus the source that was just masked out). 
This is not valid for two reasons: some active cpus may not be in the current scheduling domain and one of the active cpus is dst_cpu. These cpus should not be considered, as we cannot pull load from them. Instead of failing out of load_balance(), we may end up redoing the search with no valid cpus and incorrectly concluding the domain is balanced. Additionally, if the group_imbalance flag was just set, it may also be incorrectly unset, thus the flag will not be seen by other cpus in future load_balance() runs as that algorithm intends. Fix the check by removing cpus not in the current domain and the dst_cpu from considertation, thus limiting the evaluation to valid remaining cpus from which load might be migrated. Co-authored-by: Austin Christ Co-authored-by: Dietmar Eggemann Signed-off-by: Jeffrey Hugo Tested-by: Tyler Baicar Change-Id: Ife6701c9c62e7155493d9db9398f08c4474e94b3 --- kernel/sched/fair.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 42f05c742846..a2f52c35c76a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -10697,7 +10697,24 @@ more_balance: /* All tasks on this runqueue were pinned by CPU affinity */ if (unlikely(env.flags & LBF_ALL_PINNED)) { cpumask_clear_cpu(cpu_of(busiest), cpus); - if (!cpumask_empty(cpus)) { + /* + * dst_cpu is not a valid busiest cpu in the following + * check since load cannot be pulled from dst_cpu to be + * put on dst_cpu. + */ + cpumask_clear_cpu(env.dst_cpu, cpus); + /* + * Go back to "redo" iff the load-balance cpumask + * contains other potential busiest cpus for the + * current sched domain. + */ + if (cpumask_intersects(cpus, sched_domain_span(env.sd))) { + /* + * Now that the check has passed, reenable + * dst_cpu so that load can be calculated on + * it in the redo path. + */ + cpumask_set_cpu(env.dst_cpu, cpus); env.loop = 0; env.loop_break = sched_nr_migrate_break; goto redo; -- cgit v1.2.3 From 938e42ca699f3224fcb9687bf9feba3f4a1abf32 Mon Sep 17 00:00:00 2001 From: Maulik Shah Date: Thu, 21 Jun 2018 11:48:23 +0530 Subject: drivers: cpuidle: lpm-levels: Correctly check for list empty Correctly check for list empty condition to get least cluster latency. Change-Id: I6584a8d6d77794ca506c994d927467e9c1fefa63 Signed-off-by: Maulik Shah --- drivers/cpuidle/lpm-levels.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cpuidle/lpm-levels.c b/drivers/cpuidle/lpm-levels.c index 1eaef20e5ed5..f46126e41266 100644 --- a/drivers/cpuidle/lpm-levels.c +++ b/drivers/cpuidle/lpm-levels.c @@ -187,7 +187,7 @@ static uint32_t least_cluster_latency(struct lpm_cluster *cluster, uint32_t latency = 0; int i; - if (!cluster->list.next) { + if (list_empty(&cluster->list)) { for (i = 0; i < cluster->nlevels; i++) { level = &cluster->levels[i]; pwr_params = &level->pwr; -- cgit v1.2.3 From c71b8fffe6b3d099b76c05f922fde8aa4b6c2334 Mon Sep 17 00:00:00 2001 From: Lina Iyer Date: Mon, 17 Jul 2017 11:50:25 -0600 Subject: drivers: cpuidle: lpm-levels: Fix KW issues with idle state idx < 0 Idle state calculcations will need to return the state chosen as an integer. The state chosen is used as a index into the array and as such cannot be negative value. Do not return negative errors from the calculations. By default, the state returned wil be zero. 
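Note: the contract after this change is simply "cpu_power_select() always returns a usable array index, never an error code". The sketch below is a minimal standalone model of that pattern with an invented latency table; it is not the driver's actual selection logic.

    #include <stdbool.h>
    #include <stdio.h>

    #define NR_STATES 4

    /* Invented per-state exit latencies; index 0 is the shallowest state. */
    static const int state_latency_us[NR_STATES] = { 1, 50, 500, 2000 };

    /* Starts from state 0 and only moves deeper while constraints allow,
     * so no caller can ever receive a negative value to index with. */
    static int select_state(int latency_limit_us, bool sleep_disabled)
    {
            int best = 0;   /* default: shallowest state */

            if (sleep_disabled)
                    return best;

            for (int i = 1; i < NR_STATES; i++) {
                    if (state_latency_us[i] > latency_limit_us)
                            break;
                    best = i;
            }
            return best;
    }

    int main(void)
    {
            printf("%d\n", select_state(600, false));  /* prints 2 */
            printf("%d\n", select_state(0, false));    /* prints 0, never negative */
            return 0;
    }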
Change-Id: Idb18e933f385cf868fe99fa6a2783f6b8e84c196 Signed-off-by: Lina Iyer --- drivers/cpuidle/lpm-levels.c | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/drivers/cpuidle/lpm-levels.c b/drivers/cpuidle/lpm-levels.c index f46126e41266..dca59eadc6c2 100644 --- a/drivers/cpuidle/lpm-levels.c +++ b/drivers/cpuidle/lpm-levels.c @@ -691,7 +691,7 @@ static void update_history(struct cpuidle_device *dev, int idx); static int cpu_power_select(struct cpuidle_device *dev, struct lpm_cpu *cpu) { - int best_level = -1; + int best_level = 0; uint32_t latency_us = pm_qos_request_for_cpu(PM_QOS_CPU_DMA_LATENCY, dev->cpu); s64 sleep_us = ktime_to_us(tick_nohz_get_sleep_length()); @@ -705,8 +705,6 @@ static int cpu_power_select(struct cpuidle_device *dev, uint32_t *min_residency = get_per_cpu_min_residency(dev->cpu); uint32_t *max_residency = get_per_cpu_max_residency(dev->cpu); - if (!cpu) - return -EINVAL; if ((sleep_disabled && !cpu_isolated(dev->cpu)) || sleep_us < 0) return 0; @@ -1536,17 +1534,11 @@ static int lpm_cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev) { struct lpm_cluster *cluster = per_cpu(cpu_cluster, dev->cpu); - int idx; if (!cluster) return 0; - idx = cpu_power_select(dev, cluster->cpu); - - if (idx < 0) - return -EPERM; - - return idx; + return cpu_power_select(dev, cluster->cpu); } static void update_history(struct cpuidle_device *dev, int idx) @@ -1591,9 +1583,6 @@ static int lpm_cpuidle_enter(struct cpuidle_device *dev, int64_t start_time = ktime_to_ns(ktime_get()), end_time; struct power_params *pwr_params; - if (idx < 0) - return -EINVAL; - pwr_params = &cluster->cpu->levels[idx].pwr; sched_set_cpu_cstate(smp_processor_id(), idx + 1, pwr_params->energy_overhead, pwr_params->latency_us); -- cgit v1.2.3 From 43cbf9d6153d820d68ba96ddb1a57dbdc7373c90 Mon Sep 17 00:00:00 2001 From: Pavankumar Kondeti Date: Wed, 27 Mar 2019 13:31:40 +0530 Subject: sched/tune: Increase the cgroup limit to 6 The schedtune cgroup controller allows upto 5 cgroups including the default/root cgroup. Until now the user space is creating only 4 additional cgroups namely, foreground, background, top-app and audio-app. Recently another cgroup called rt is created before the audio-app cgroup. Since kernel limits the cgroups to 5, the creation of audio-app cgroup is failing. Fix this by increasing the schedtune cgroup controller cgroup limit to 6. Change-Id: I13252a90dba9b8010324eda29b8901cb0b20bc21 Signed-off-by: Pavankumar Kondeti --- kernel/sched/tune.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index b84d13750604..d8beda0d74b4 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -240,7 +240,7 @@ schedtune_accept_deltas(int nrg_delta, int cap_delta, * implementation especially for the computation of the per-CPU boost * value */ -#define BOOSTGROUPS_COUNT 5 +#define BOOSTGROUPS_COUNT 6 /* Array of configured boostgroups */ static struct schedtune *allocated_group[BOOSTGROUPS_COUNT] = { -- cgit v1.2.3 From 41cbb7bc59fb94f020839f20b890033e2f407ca3 Mon Sep 17 00:00:00 2001 From: Joonwoo Park Date: Thu, 1 Jun 2017 10:59:12 -0700 Subject: sched: walt: fix window misalignment when HZ=300 Due to rounding error hrtimer tick interval becomes 3333333 ns when HZ=300. Consequently the tick time stamp nearest to the WALT's default window size 20ms will be also 19999998 (3333333 * 6). 
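Note: the numbers above follow from integer arithmetic alone, and the same rounding is what the later "Correct WALT window size initialization" patch in this series applies generically. A quick standalone check, approximating TICK_NSEC as NSEC_PER_SEC / HZ:

    #include <stdio.h>

    int main(void)
    {
            const unsigned long long NSEC_PER_SEC = 1000000000ULL;
            unsigned long long hz = 300;
            unsigned long long tick_nsec = NSEC_PER_SEC / hz;  /* 3333333 */
            unsigned long long window = 20000000ULL;           /* 20 ms */

            /* Align the window to a whole number of ticks. */
            unsigned long long aligned = (window / tick_nsec) * tick_nsec;

            printf("tick    = %llu ns\n", tick_nsec);  /* 3333333  */
            printf("aligned = %llu ns\n", aligned);    /* 19999998 */
            return 0;
    }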
[beykerykt]: Adapt for HMP Change-Id: I08f9bd2dbecccbb683e4490d06d8b0da703d3ab2 Suggested-by: Joel Fernandes Signed-off-by: Joonwoo Park --- kernel/sched/hmp.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/kernel/sched/hmp.c b/kernel/sched/hmp.c index 598656b42203..f9fff7217eaa 100644 --- a/kernel/sched/hmp.c +++ b/kernel/sched/hmp.c @@ -765,7 +765,15 @@ unsigned int min_max_possible_capacity = 1024; /* min(rq->max_possible_capacity) */ /* Min window size (in ns) = 10ms */ +#ifdef CONFIG_HZ_300 +/* + * Tick interval becomes to 3333333 due to + * rounding error when HZ=300. + */ +#define MIN_SCHED_RAVG_WINDOW (3333333 * 6) +#else #define MIN_SCHED_RAVG_WINDOW 10000000 +#endif /* Max window size (in ns) = 1s */ #define MAX_SCHED_RAVG_WINDOW 1000000000 -- cgit v1.2.3 From 0fa652ee00f5aaf9fdebea0e0f840e59bdb6795b Mon Sep 17 00:00:00 2001 From: Vikram Mulukutla Date: Thu, 10 Aug 2017 17:26:20 -0700 Subject: sched: walt: Correct WALT window size initialization It is preferable that WALT window rollover occurs just before a tick, since the tick is an opportune moment to record a complete window's statistics, as well as report those stats to the cpu frequency governor. When CONFIG_HZ results in a TICK_NSEC that isn't a integral number, this requirement may be violated. Account for this by reducing the WALT window size to the nearest multiple of TICK_NSEC. Commit d368c6faa19b ("sched: walt: fix window misalignment when HZ=300") attempted to do this but WALT isn't using MIN_SCHED_RAVG_WINDOW as the window size and the patch was doing nothing. Also, change the type of 'walt_disabled' to bool and warn if an invalid window size causes WALT to be disabled. [beykerykt]: Adapt for HMP Change-Id: Ie3dcfc21a3df4408254ca1165a355bbe391ed5c7 Signed-off-by: Vikram Mulukutla --- kernel/sched/hmp.c | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/kernel/sched/hmp.c b/kernel/sched/hmp.c index f9fff7217eaa..1755e919f8f4 100644 --- a/kernel/sched/hmp.c +++ b/kernel/sched/hmp.c @@ -765,20 +765,15 @@ unsigned int min_max_possible_capacity = 1024; /* min(rq->max_possible_capacity) */ /* Min window size (in ns) = 10ms */ -#ifdef CONFIG_HZ_300 -/* - * Tick interval becomes to 3333333 due to - * rounding error when HZ=300. - */ -#define MIN_SCHED_RAVG_WINDOW (3333333 * 6) -#else -#define MIN_SCHED_RAVG_WINDOW 10000000 -#endif +#define MIN_SCHED_RAVG_WINDOW ((10000000 / TICK_NSEC) * TICK_NSEC) /* Max window size (in ns) = 1s */ -#define MAX_SCHED_RAVG_WINDOW 1000000000 +#define MAX_SCHED_RAVG_WINDOW ((1000000000 / TICK_NSEC) * TICK_NSEC) -/* Window size (in ns) */ +/* + * Window size (in ns). Adjust for the tick size so that the window + * rollover occurs just before the tick boundary. 
+ */ __read_mostly unsigned int sched_ravg_window = MIN_SCHED_RAVG_WINDOW; /* Maximum allowed threshold before freq aggregation must be enabled */ @@ -1624,17 +1619,20 @@ static inline int exiting_task(struct task_struct *p) static int __init set_sched_ravg_window(char *str) { + unsigned int adj_window; unsigned int window_size; get_option(&str, &window_size); - if (window_size < MIN_SCHED_RAVG_WINDOW || - window_size > MAX_SCHED_RAVG_WINDOW) { - WARN_ON(1); - return -EINVAL; - } + /* Adjust for CONFIG_HZ */ + adj_window = (window_size / TICK_NSEC) * TICK_NSEC; + + /* Warn if we're a bit too far away from the expected window size */ + WARN(adj_window < window_size - NSEC_PER_MSEC, + "tick-adjusted window size %u, original was %u\n", adj_window, + window_size); - sched_ravg_window = window_size; + sched_ravg_window = adj_window; return 0; } -- cgit v1.2.3 From 6adb092856e806d91f3fc22dff0ef36506dd0bae Mon Sep 17 00:00:00 2001 From: Vikram Mulukutla Date: Tue, 6 Jun 2017 11:58:27 -0700 Subject: sched: cpufreq: Limit governor updates to WALT changes alone It's not necessary to keep reporting load to the governor if it doesn't change in a window. Limit updates to when we expect load changes - after window rollover and when we send updates related to intercluster migrations. [beykerykt]: Adapt for HMP Change-Id: I3232d40f3d54b0b81cfafdcdb99b534df79327bf Signed-off-by: Vikram Mulukutla --- include/linux/sched.h | 1 + kernel/sched/hmp.c | 6 ++++-- kernel/sched/sched.h | 4 +++- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 9cb6964d178e..0e8fff43cc17 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -3724,6 +3724,7 @@ static inline unsigned long rlimit_max(unsigned int limit) #define SCHED_CPUFREQ_DL (1U << 1) #define SCHED_CPUFREQ_IOWAIT (1U << 2) #define SCHED_CPUFREQ_INTERCLUSTER_MIG (1U << 3) +#define SCHED_CPUFREQ_WALT (1U << 4) #ifdef CONFIG_CPU_FREQ struct update_util_data { diff --git a/kernel/sched/hmp.c b/kernel/sched/hmp.c index 1755e919f8f4..6a403be2ae7c 100644 --- a/kernel/sched/hmp.c +++ b/kernel/sched/hmp.c @@ -3663,8 +3663,10 @@ void fixup_busy_time(struct task_struct *p, int new_cpu) migrate_top_tasks(p, src_rq, dest_rq); if (!same_freq_domain(new_cpu, task_cpu(p))) { - cpufreq_update_util(dest_rq, SCHED_CPUFREQ_INTERCLUSTER_MIG); - cpufreq_update_util(src_rq, SCHED_CPUFREQ_INTERCLUSTER_MIG); + cpufreq_update_util(dest_rq, SCHED_CPUFREQ_INTERCLUSTER_MIG | + SCHED_CPUFREQ_WALT); + cpufreq_update_util(src_rq, SCHED_CPUFREQ_INTERCLUSTER_MIG | + SCHED_CPUFREQ_WALT); } if (p == src_rq->ed_task) { diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 90cc450dff7e..40da1a509ded 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2856,8 +2856,10 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) #ifdef CONFIG_SCHED_HMP /* * Skip if we've already reported, but not if this is an inter-cluster - * migration + * migration. Also only allow WALT update sites. */ + if (!(flags & SCHED_CPUFREQ_WALT)) + return; if (!sched_disable_window_stats && (rq->load_reported_window == rq->window_start) && !(flags & SCHED_CPUFREQ_INTERCLUSTER_MIG)) -- cgit v1.2.3 From c7128748614ad0ca5bf9533ea9723bdd2ddf8838 Mon Sep 17 00:00:00 2001 From: Pavankumar Kondeti Date: Mon, 28 May 2018 15:39:20 +0530 Subject: sched/cpupri: Exclude isolated CPUs from the lowest_mask The cpupri_find() returns the candidate CPUs which are running lower priority than the waking RT task in the lowest_mask. 
This contains isolated CPUs as well. Since the energy aware CPU selection skips isolated CPUs, no target CPU may be found if all unisolated CPUs are running higher priority RT tasks. In which case, we fallback to the default CPU selection algorithm and returns an isolated CPU. This decision is reversed by select_task_rq() and returns an unisolated CPU that is busy with other RT tasks. This RT task packing is desired behavior. However, RT push mechanism pushes the packed RT task to an isolated CPU. This can be avoided by excluding isolated CPUs from the lowest_mask returned by cpupri_find(). Change-Id: I75486b3935caf496a638d0333565beffc47fe249 Signed-off-by: Pavankumar Kondeti --- kernel/sched/cpupri.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index 14225d5d8617..867cb7877511 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -133,6 +133,8 @@ retry: if (lowest_mask) { cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask); + cpumask_andnot(lowest_mask, lowest_mask, + cpu_isolated_mask); if (drop_nopreempts) drop_nopreempt_cpus(lowest_mask); /* -- cgit v1.2.3 From ef3fb04c7df43dfa1793e33f764a2581cda96310 Mon Sep 17 00:00:00 2001 From: Vikram Mulukutla Date: Mon, 8 May 2017 19:20:22 -0700 Subject: sched: cpufreq: Use sched_clock instead of rq_clock when updating schedutil rq_clock may not be updated often enough for schedutil or other cpufreq governors to work correctly when it's passed as the timestamp for a load report. Use sched_clock instead. [beykerykt]: Switch to sched_ktime_clock() Change-Id: I745b727870a31da25f766c2c2f37527f568c20da Signed-off-by: Vikram Mulukutla --- kernel/sched/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 40da1a509ded..e78a3e867472 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2869,7 +2869,7 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)); if (data) - data->func(data, rq_clock(rq), flags); + data->func(data, sched_ktime_clock(), flags); } static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) -- cgit v1.2.3 From 4dbe44554792f83b785eed187aa1bcd69e84094c Mon Sep 17 00:00:00 2001 From: Vikram Mulukutla Date: Tue, 9 May 2017 17:49:47 -0700 Subject: sched: cpufreq: Use per_cpu_ptr instead of this_cpu_ptr when reporting load We need cpufreq_update_util to report load for the CPU corresponding to the rq that is passed in as an argument, rather than the CPU executing cpufreq_update_util. 
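Note: the point is the difference between "the per-CPU slot of whichever CPU happens to execute this code" and "the per-CPU slot of the CPU the runqueue describes". A userspace sketch with a plain array standing in for the per-CPU data; all names are invented for illustration:

    #include <stdio.h>

    #define NR_CPUS 4

    /* Stand-in for the per-CPU update hooks. */
    static const char *cpu_hook[NR_CPUS] = { "hook0", "hook1", "hook2", "hook3" };

    static int executing_cpu = 0;   /* the CPU running the update code */

    /* this_cpu_ptr() analogue: indexes by the executing CPU. */
    static const char *hook_of_executor(void)
    {
            return cpu_hook[executing_cpu];
    }

    /* per_cpu_ptr(..., cpu) analogue: indexes by the CPU being described. */
    static const char *hook_of(int cpu)
    {
            return cpu_hook[cpu];
    }

    int main(void)
    {
            int rq_cpu = 2;   /* cpu_of(rq): the CPU whose load is reported */

            printf("wrong target: %s\n", hook_of_executor());  /* hook0 */
            printf("right target: %s\n", hook_of(rq_cpu));     /* hook2 */
            return 0;
    }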
Change-Id: I8473f230d40928d5920c614760e96fef12745d5a Signed-off-by: Vikram Mulukutla --- kernel/sched/sched.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index e78a3e867472..1196276eddf6 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2867,7 +2867,8 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) rq->load_reported_window = rq->window_start; #endif - data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)); + data = rcu_dereference_sched(*per_cpu_ptr(&cpufreq_update_util_data, + cpu_of(rq))); if (data) data->func(data, sched_ktime_clock(), flags); } -- cgit v1.2.3 From aee7a16e347b610fbb50d743c42c21e56cf11211 Mon Sep 17 00:00:00 2001 From: Joonwoo Park Date: Mon, 8 May 2017 19:39:27 -0700 Subject: sched: WALT: increase WALT minimum window size to 20ms Increase WALT minimum window size to 20ms. 10ms isn't large enough capture workload's pattern. [beykerykt}: Adapt for HMP Change-Id: I4d69577fbfeac2bc23db4ff414939cc51ada30d6 Signed-off-by: Joonwoo Park --- kernel/sched/hmp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched/hmp.c b/kernel/sched/hmp.c index 6a403be2ae7c..649d6a437a13 100644 --- a/kernel/sched/hmp.c +++ b/kernel/sched/hmp.c @@ -764,8 +764,8 @@ unsigned int max_possible_capacity = 1024; /* max(rq->max_possible_capacity) */ unsigned int min_max_possible_capacity = 1024; /* min(rq->max_possible_capacity) */ -/* Min window size (in ns) = 10ms */ -#define MIN_SCHED_RAVG_WINDOW ((10000000 / TICK_NSEC) * TICK_NSEC) +/* Min window size (in ns) = 20ms */ +#define MIN_SCHED_RAVG_WINDOW ((20000000 / TICK_NSEC) * TICK_NSEC) /* Max window size (in ns) = 1s */ #define MAX_SCHED_RAVG_WINDOW ((1000000000 / TICK_NSEC) * TICK_NSEC) -- cgit v1.2.3 From db74739c86de566ca88a8affa96d2d59cfbdbb98 Mon Sep 17 00:00:00 2001 From: Pavankumar Kondeti Date: Fri, 14 Sep 2018 13:59:59 +0530 Subject: sched: Don't fail isolation request for an already isolated CPU When isolating a CPU, a check is performed to see if there is only 1 active CPU in the system. If that is the case, the CPU is not isolated. However this check is done before testing if the requested CPU is already isolated or not. If the requested CPU is already isolated, there is no need to fail the isolation even when there is only 1 active CPU in the system. For example, 0-6 CPUs are isolated on a 8 CPU machine. When an isolation request comes for CPU6, which is already isolated, the current code fail the requesting thinking we end up with no active CPU in the system. 
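Note: the reordering below amounts to "count the vote first, and only veto brand-new isolation requests". A small standalone model of that flow, reproducing the 8-CPU example above; the counters and helper are invented for illustration and are not the kernel's bookkeeping:

    #include <stdio.h>

    #define NR_CPUS 8

    static int isolation_vote[NR_CPUS];
    static int nr_unisolated = NR_CPUS;

    /* Returns 0 on success, -1 if the request would leave no active CPU. */
    static int isolate_cpu(int cpu)
    {
            if (++isolation_vote[cpu] > 1)
                    return 0;   /* already isolated: always succeeds */

            if (nr_unisolated == 1) {   /* cannot isolate the last active CPU */
                    --isolation_vote[cpu];
                    return -1;
            }
            --nr_unisolated;
            return 0;
    }

    int main(void)
    {
            /* Isolate CPUs 0-6, then repeat the request for CPU 6. */
            for (int cpu = 0; cpu <= 6; cpu++)
                    isolate_cpu(cpu);
            printf("re-isolate cpu6: %d (expected 0)\n", isolate_cpu(6));
            printf("isolate cpu7:    %d (expected -1)\n", isolate_cpu(7));
            return 0;
    }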
Change-Id: I28fea4ff67ffed82465e5cfa785414069e4a180a Signed-off-by: Pavankumar Kondeti --- kernel/sched/core.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f73831486c2f..c597f6c61115 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5967,12 +5967,6 @@ int sched_isolate_cpu(int cpu) cpumask_andnot(&avail_cpus, cpu_online_mask, cpu_isolated_mask); - /* We cannot isolate ALL cpus in the system */ - if (cpumask_weight(&avail_cpus) == 1) { - ret_code = -EINVAL; - goto out; - } - if (!cpu_online(cpu)) { ret_code = -EINVAL; goto out; @@ -5981,6 +5975,13 @@ int sched_isolate_cpu(int cpu) if (++cpu_isolation_vote[cpu] > 1) goto out; + /* We cannot isolate ALL cpus in the system */ + if (cpumask_weight(&avail_cpus) == 1) { + --cpu_isolation_vote[cpu]; + ret_code = -EINVAL; + goto out; + } + /* * There is a race between watchdog being enabled by hotplug and * core isolation disabling the watchdog. When a CPU is hotplugged in -- cgit v1.2.3 From d5cd35f38616a70453da067eda153f5dc5ede3a1 Mon Sep 17 00:00:00 2001 From: Marco Ballesio Date: Tue, 9 Mar 2021 16:35:45 -0800 Subject: FROMGIT: binder: use EINTR for interrupted wait for work when interrupted by a signal, binder_wait_for_work currently returns -ERESTARTSYS. This error code isn't propagated to user space, but a way to handle interruption due to signals must be provided to code using this API. Replace this instance of -ERESTARTSYS with -EINTR, which is propagated to user space. Bug: 180989544 (cherry picked from commit 48f10b7ed0c23e2df7b2c752ad1d3559dad007f9 git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc.git char-misc-testing) Signed-off-by: Marco Ballesio Signed-off-by: Li Li Test: built, booted, interrupted a worker thread within Acked-by: Todd Kjos Link: https://lore.kernel.org/r/20210316011630.1121213-3-dualli@chromium.org Signed-off-by: Greg Kroah-Hartman Change-Id: Ie6c7993cab699bc2c1a25a2f9d94b200a1156e5d --- drivers/android/binder.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 7c584e2ea476..370f1452710f 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -4062,7 +4062,7 @@ static int binder_wait_for_work(struct binder_thread *thread, binder_inner_proc_lock(proc); list_del_init(&thread->waiting_thread_node); if (signal_pending(current)) { - ret = -ERESTARTSYS; + ret = -EINTR; break; } } @@ -4991,7 +4991,7 @@ err: if (thread) thread->looper_need_return = false; wait_event_interruptible(binder_user_error_wait, binder_stop_on_user_error < 2); - if (ret && ret != -ERESTARTSYS) + if (ret && ret != -EINTR) pr_info("%d:%d ioctl %x %lx returned %d\n", proc->pid, current->pid, cmd, arg, ret); err_unlocked: trace_binder_ioctl_done(ret); -- cgit v1.2.3 From c0fa7577022c4169e1aaaf1bd9e04f63d285beb2 Mon Sep 17 00:00:00 2001 From: Ethan Chen Date: Sat, 20 Jan 2018 16:35:53 -0800 Subject: sched/walt: Re-add code to allow WALT to function Change-Id: Ieb1067c5e276f872ed4c722b7d1fabecbdad87e7 --- include/linux/sched.h | 9 +++++++++ include/linux/sched/sysctl.h | 6 ++++++ include/trace/events/sched.h | 7 +++++-- kernel/sched/Makefile | 1 + kernel/sched/core.c | 26 ++++++++++++++++++++++++++ kernel/sched/cputime.c | 6 ++++-- kernel/sched/fair.c | 22 ++++++++++++++++++++++ kernel/sched/rt.c | 3 +++ kernel/sched/sched.h | 5 +++++ kernel/sched/stop_task.c | 3 +++ kernel/sched/tune.c | 2 ++ kernel/sched/walt.c | 5 ----- kernel/sched/walt.h | 2 ++ 
kernel/sysctl.c | 30 ++++++++++++++++++++++++++++++ 14 files changed, 118 insertions(+), 9 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 0e8fff43cc17..4e212132a274 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1668,6 +1668,15 @@ struct task_struct { const struct sched_class *sched_class; struct sched_entity se; struct sched_rt_entity rt; +#ifdef CONFIG_SCHED_WALT + struct ravg ravg; + /* + * 'init_load_pct' represents the initial task load assigned to children + * of this task + */ + u32 init_load_pct; + u64 last_sleep_ts; +#endif #ifdef CONFIG_SCHED_HMP struct ravg ravg; /* diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 1e1fcb8791a7..c85fe9872d07 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -41,6 +41,12 @@ extern unsigned int sysctl_sched_wakeup_granularity; extern unsigned int sysctl_sched_child_runs_first; extern unsigned int sysctl_sched_sync_hint_enable; extern unsigned int sysctl_sched_cstate_aware; +#ifdef CONFIG_SCHED_WALT +extern unsigned int sysctl_sched_use_walt_cpu_util; +extern unsigned int sysctl_sched_use_walt_task_util; +extern unsigned int sysctl_sched_walt_init_task_load_pct; +extern unsigned int sysctl_sched_walt_cpu_high_irqload; +#endif #ifdef CONFIG_SCHED_HMP diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 73cd7e502d4c..70d6012c89aa 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -1903,6 +1903,7 @@ TRACE_EVENT(walt_update_task_ravg, __array( char, comm, TASK_COMM_LEN ) __field( pid_t, pid ) __field( pid_t, cur_pid ) + __field(unsigned int, cur_freq ) __field( u64, wallclock ) __field( u64, mark_start ) __field( u64, delta_m ) @@ -1930,6 +1931,7 @@ TRACE_EVENT(walt_update_task_ravg, __entry->evt = evt; __entry->cpu = rq->cpu; __entry->cur_pid = rq->curr->pid; + __entry->cur_freq = rq->cur_freq; memcpy(__entry->comm, p->comm, TASK_COMM_LEN); __entry->pid = p->pid; __entry->mark_start = p->ravg.mark_start; @@ -1948,10 +1950,11 @@ TRACE_EVENT(walt_update_task_ravg, __entry->active_windows = p->ravg.active_windows; ), - TP_printk("wc %llu ws %llu delta %llu event %d cpu %d cur_pid %d task %d (%s) ms %llu delta %llu demand %u sum %u irqtime %llu" + TP_printk("wc %llu ws %llu delta %llu event %d cpu %d cur_freq %u cur_pid %d task %d (%s) ms %llu delta %llu demand %u sum %u irqtime %llu" " cs %llu ps %llu util %lu cur_window %u prev_window %u active_wins %u" , __entry->wallclock, __entry->win_start, __entry->delta, - __entry->evt, __entry->cpu, __entry->cur_pid, + __entry->evt, __entry->cpu, + __entry->cur_freq, __entry->cur_pid, __entry->pid, __entry->comm, __entry->mark_start, __entry->delta_m, __entry->demand, __entry->sum, __entry->irqtime, diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 7dde1b9918e4..ea301717538f 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -19,6 +19,7 @@ obj-y += core.o loadavg.o clock.o cputime.o obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o obj-y += wait.o completion.o idle.o sched_avg.o obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o energy.o +obj-$(CONFIG_SCHED_WALT) += walt.o obj-$(CONFIG_SCHED_HMP) += hmp.o boost.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c597f6c61115..17c13347d703 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -98,6 +98,7 @@ #define CREATE_TRACE_POINTS #include +#include "walt.h" 
ATOMIC_NOTIFIER_HEAD(load_alert_notifier_head); @@ -1389,6 +1390,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) p->se.nr_migrations++; perf_event_task_migrate(p); + walt_fixup_busy_time(p, new_cpu); fixup_busy_time(p, new_cpu); } @@ -2155,6 +2157,9 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags, raw_spin_lock(&rq->lock); old_load = task_load(p); + wallclock = walt_ktime_clock(); + walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); + walt_update_task_ravg(p, rq, TASK_WAKE, wallclock, 0); wallclock = sched_ktime_clock(); update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); update_task_ravg(p, rq, TASK_WAKE, wallclock, 0); @@ -2253,6 +2258,11 @@ static void try_to_wake_up_local(struct task_struct *p) update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); update_task_ravg(p, rq, TASK_WAKE, wallclock, 0); cpufreq_update_util(rq, 0); + + wallclock = walt_ktime_clock(); + + walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); + walt_update_task_ravg(p, rq, TASK_WAKE, wallclock, 0); ttwu_activate(rq, p, ENQUEUE_WAKEUP); note_task_waking(p, wallclock); } @@ -2385,6 +2395,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) #endif INIT_LIST_HEAD(&p->se.group_node); + walt_init_new_task_load(p); #ifdef CONFIG_FAIR_GROUP_SCHED p->se.cfs_rq = NULL; @@ -2669,6 +2680,7 @@ void wake_up_new_task(struct task_struct *p) struct rq *rq; add_new_task_to_grp(p); + walt_init_new_task_load(p); raw_spin_lock_irqsave(&p->pi_lock, flags); p->state = TASK_RUNNING; @@ -2687,6 +2699,7 @@ void wake_up_new_task(struct task_struct *p) #endif rq = __task_rq_lock(p); mark_task_starting(p); + walt_mark_task_starting(p); update_rq_clock(rq); post_init_entity_util_avg(&p->se); activate_task(rq, p, ENQUEUE_WAKEUP_NEW); @@ -3235,10 +3248,13 @@ void scheduler_tick(void) raw_spin_lock(&rq->lock); old_load = task_load(curr); + walt_set_window_start(rq); set_window_start(rq); update_rq_clock(rq); curr->sched_class->task_tick(rq, curr, 0); update_cpu_load_active(rq); + walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, + walt_ktime_clock(), 0); calc_global_load_tick(rq); wallclock = sched_ktime_clock(); update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); @@ -3602,6 +3618,9 @@ static void __sched notrace __schedule(bool preempt) update_rq_clock(rq); next = pick_next_task(rq, prev); + wallclock = walt_ktime_clock(); + walt_update_task_ravg(prev, rq, PUT_PREV_TASK, wallclock, 0); + walt_update_task_ravg(next, rq, PICK_NEXT_TASK, wallclock, 0); clear_tsk_need_resched(prev); clear_preempt_need_resched(); rq->clock_skip_update = 0; @@ -6364,6 +6383,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) case CPU_UP_PREPARE: raw_spin_lock_irqsave(&rq->lock, flags); + walt_set_window_start(rq); set_window_start(rq); raw_spin_unlock_irqrestore(&rq->lock, flags); rq->calc_load_update = calc_load_update; @@ -6385,6 +6405,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) sched_ttwu_pending(); /* Update our root-domain */ raw_spin_lock_irqsave(&rq->lock, flags); + walt_migrate_sync_cpu(cpu); if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); @@ -8580,6 +8601,11 @@ void __init sched_init(void) } #endif rq->max_idle_balance_cost = sysctl_sched_migration_cost; +#ifdef CONFIG_SCHED_WALT + rq->cur_irqload = 0; + rq->avg_irqload = 0; + rq->irqload_ts = 0; +#endif INIT_LIST_HEAD(&rq->cfs_tasks); diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 
e6ec68c15aa3..cf6729cb46dd 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -6,6 +6,7 @@ #include #include #include "sched.h" +#include "walt.h" #ifdef CONFIG_IRQ_TIME_ACCOUNTING @@ -79,9 +80,10 @@ void irqtime_account_irq(struct task_struct *curr) irq_time_write_end(); - if (account) + if (account) { + walt_account_irqtime(cpu, curr, delta, wallclock); sched_account_irqtime(cpu, curr, delta, wallclock); - else if (curr != this_cpu_ksoftirqd()) + } else if (curr != this_cpu_ksoftirqd()) sched_account_irqstart(cpu, curr, wallclock); local_irq_restore(flags); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a2f52c35c76a..08e608a04f5b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -55,6 +55,12 @@ unsigned int normalized_sysctl_sched_latency = 6000000ULL; unsigned int sysctl_sched_sync_hint_enable = 1; unsigned int sysctl_sched_cstate_aware = 1; +#ifdef CONFIG_SCHED_WALT +unsigned int sysctl_sched_use_walt_cpu_util = 1; +unsigned int sysctl_sched_use_walt_task_util = 1; +__read_mostly unsigned int sysctl_sched_walt_cpu_high_irqload = + (10 * NSEC_PER_MSEC); +#endif /* * The initial- and re-scaling of tunables is configurable * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) @@ -5961,6 +5967,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; cfs_rq->h_nr_running++; + walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p); inc_cfs_rq_hmp_stats(cfs_rq, p, 1); flags = ENQUEUE_WAKEUP; @@ -5969,6 +5976,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); cfs_rq->h_nr_running++; + walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p); inc_cfs_rq_hmp_stats(cfs_rq, p, 1); if (cfs_rq_throttled(cfs_rq)) @@ -6005,6 +6013,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) schedtune_enqueue_task(p, cpu_of(rq)); if (energy_aware() && !se) { + walt_inc_cumulative_runnable_avg(rq, p); if (!task_new && !rq->rd->overutilized && cpu_overutilized(rq->cpu)) { rq->rd->overutilized = true; @@ -6042,6 +6051,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; cfs_rq->h_nr_running--; + walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p); dec_cfs_rq_hmp_stats(cfs_rq, p, 1); /* Don't dequeue parent if it has other entities besides us */ @@ -6062,6 +6072,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); cfs_rq->h_nr_running--; + walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p); dec_cfs_rq_hmp_stats(cfs_rq, p, 1); if (cfs_rq_throttled(cfs_rq)) @@ -7098,6 +7109,12 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, static inline unsigned long task_util(struct task_struct *p) { +#ifdef CONFIG_SCHED_WALT + if (!walt_disabled && sysctl_sched_use_walt_cpu_util) { + unsigned long demand = p->ravg.demand; + return (demand << 10) / walt_ravg_window; + } +#endif return p->se.avg.util_avg; } @@ -7656,6 +7673,11 @@ static inline int find_best_target(struct task_struct *p, int *backup_cpu, if (new_util > capacity_orig) continue; +#ifdef CONFIG_SCHED_WALT + if (walt_cpu_high_irqload(i)) + continue; +#endif + /* * Case A) Latency sensitive tasks * diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 391ec29c71c0..2083a54cdd49 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -12,6 +12,7 @@ #include #include "tune.h" +#include "walt.h" int sched_rr_timeslice = RR_TIMESLICE; 
int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE; @@ -1449,6 +1450,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) rt_se->timeout = 0; enqueue_rt_entity(rt_se, flags); + walt_inc_cumulative_runnable_avg(rq, p); inc_hmp_sched_stats_rt(rq, p); if (!task_current(rq, p) && p->nr_cpus_allowed > 1) @@ -1488,6 +1490,7 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) update_curr_rt(rq); dequeue_rt_entity(rt_se, flags); + walt_dec_cumulative_runnable_avg(rq, p); dec_hmp_sched_stats_rt(rq, p); dequeue_pushable_task(rq, p); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 1196276eddf6..284cc86d3ad4 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -511,6 +511,10 @@ struct cfs_rq { struct list_head leaf_cfs_rq_list; struct task_group *tg; /* group that "owns" this runqueue */ +#ifdef CONFIG_SCHED_WALT + u64 cumulative_runnable_avg; +#endif + #ifdef CONFIG_CFS_BANDWIDTH #ifdef CONFIG_SCHED_HMP @@ -819,6 +823,7 @@ struct rq { #endif #ifdef CONFIG_SCHED_WALT + unsigned int cur_freq; u64 cumulative_runnable_avg; u64 window_start; u64 curr_runnable_sum; diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 3278c81cefb1..0fa11d86599e 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -1,4 +1,5 @@ #include "sched.h" +#include "walt.h" /* * stop-task scheduling class. @@ -78,6 +79,7 @@ static void enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) { add_nr_running(rq, 1); + walt_inc_cumulative_runnable_avg(rq, p); inc_hmp_sched_stats_stop(rq, p); } @@ -85,6 +87,7 @@ static void dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) { sub_nr_running(rq, 1); + walt_dec_cumulative_runnable_avg(rq, p); dec_hmp_sched_stats_stop(rq, p); } diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index d8beda0d74b4..d0ef97f484b1 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -770,6 +770,7 @@ boost_write(struct cgroup_subsys_state *css, struct cftype *cft, static void schedtune_attach(struct cgroup_taskset *tset) { +#ifdef CONFIG_SCHED_HMP struct task_struct *task; struct cgroup_subsys_state *css; struct schedtune *st; @@ -782,6 +783,7 @@ static void schedtune_attach(struct cgroup_taskset *tset) cgroup_taskset_for_each(task, css, tset) sync_cgroup_colocation(task, colocate); +#endif } static struct cftype files[] = { diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c index 8d25ffbe4fed..911606537808 100644 --- a/kernel/sched/walt.c +++ b/kernel/sched/walt.c @@ -57,11 +57,6 @@ static unsigned int sync_cpu; static ktime_t ktime_last; static bool walt_ktime_suspended; -static unsigned int task_load(struct task_struct *p) -{ - return p->ravg.demand; -} - static inline void fixup_cum_window_demand(struct rq *rq, s64 delta) { rq->cum_window_demand += delta; diff --git a/kernel/sched/walt.h b/kernel/sched/walt.h index de7edac43674..34c72a0fcf39 100644 --- a/kernel/sched/walt.h +++ b/kernel/sched/walt.h @@ -54,6 +54,8 @@ static inline void walt_set_window_start(struct rq *rq) { } static inline void walt_migrate_sync_cpu(int cpu) { } static inline void walt_init_cpu_efficiency(void) { } static inline u64 walt_ktime_clock(void) { return 0; } +static inline void walt_account_irqtime(int cpu, struct task_struct *curr, + u64 delta, u64 wallclock) { } #define walt_cpu_high_irqload(cpu) false diff --git a/kernel/sysctl.c b/kernel/sysctl.c index e2aab9cf058b..8980bdffde3d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -575,6 +575,36 @@ static 
struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, +#ifdef CONFIG_SCHED_WALT + { + .procname = "sched_use_walt_cpu_util", + .data = &sysctl_sched_use_walt_cpu_util, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sched_use_walt_task_util", + .data = &sysctl_sched_use_walt_task_util, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sched_walt_init_task_load_pct", + .data = &sysctl_sched_walt_init_task_load_pct, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sched_walt_cpu_high_irqload", + .data = &sysctl_sched_walt_cpu_high_irqload, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif { .procname = "sched_cstate_aware", .data = &sysctl_sched_cstate_aware, -- cgit v1.2.3 From 33d3b17bfdfb68760bc3fdf79748f8b65ce71978 Mon Sep 17 00:00:00 2001 From: Rashed Abdel-Tawab Date: Thu, 18 Jan 2018 12:49:34 -0800 Subject: ARM: dts: msm: Add msm8998 energy model Squash of commits: ed6442938f08: Enable EAS in 8998 MTP 3989a0e22e44: Update Energy Model using Muskie 922c6f4b9e8b: Added idle-cost-data to energy model and fixed busy-cost-data for big cluster cpus Change-Id: I717eb88204f5e28a1afd494dc484895cc749e2fc --- arch/arm/boot/dts/qcom/msm8998.dtsi | 136 ++++++++++++++++++++++++++++++++++++ 1 file changed, 136 insertions(+) diff --git a/arch/arm/boot/dts/qcom/msm8998.dtsi b/arch/arm/boot/dts/qcom/msm8998.dtsi index ae664e48afff..89936736e0ff 100644 --- a/arch/arm/boot/dts/qcom/msm8998.dtsi +++ b/arch/arm/boot/dts/qcom/msm8998.dtsi @@ -49,6 +49,7 @@ qcom,lmh-dcvs = <&lmh_dcvs0>; enable-method = "psci"; efficiency = <1024>; + sched-energy-costs = <&CPU_COST_0 &CLUSTER_COST_0>; next-level-cache = <&L2_0>; qcom,ea = <&ea0>; L2_0: l2-cache { @@ -77,6 +78,7 @@ qcom,lmh-dcvs = <&lmh_dcvs0>; enable-method = "psci"; efficiency = <1024>; + sched-energy-costs = <&CPU_COST_0 &CLUSTER_COST_0>; next-level-cache = <&L2_0>; qcom,ea = <&ea1>; L1_I_1: l1-icache { @@ -100,6 +102,7 @@ qcom,lmh-dcvs = <&lmh_dcvs0>; enable-method = "psci"; efficiency = <1024>; + sched-energy-costs = <&CPU_COST_0 &CLUSTER_COST_0>; next-level-cache = <&L2_0>; qcom,ea = <&ea2>; L1_I_2: l1-icache { @@ -123,6 +126,7 @@ qcom,lmh-dcvs = <&lmh_dcvs0>; enable-method = "psci"; efficiency = <1024>; + sched-energy-costs = <&CPU_COST_0 &CLUSTER_COST_0>; next-level-cache = <&L2_0>; qcom,ea = <&ea3>; L1_I_3: l1-icache { @@ -146,6 +150,7 @@ qcom,lmh-dcvs = <&lmh_dcvs1>; enable-method = "psci"; efficiency = <1536>; + sched-energy-costs = <&CPU_COST_1 &CLUSTER_COST_1>; next-level-cache = <&L2_1>; qcom,ea = <&ea4>; L2_1: l2-cache { @@ -173,6 +178,7 @@ qcom,lmh-dcvs = <&lmh_dcvs1>; enable-method = "psci"; efficiency = <1536>; + sched-energy-costs = <&CPU_COST_1 &CLUSTER_COST_1>; next-level-cache = <&L2_1>; qcom,ea = <&ea5>; L1_I_101: l1-icache { @@ -196,6 +202,7 @@ qcom,lmh-dcvs = <&lmh_dcvs1>; enable-method = "psci"; efficiency = <1536>; + sched-energy-costs = <&CPU_COST_1 &CLUSTER_COST_1>; next-level-cache = <&L2_1>; qcom,ea = <&ea6>; L1_I_102: l1-icache { @@ -219,6 +226,7 @@ qcom,lmh-dcvs = <&lmh_dcvs1>; enable-method = "psci"; efficiency = <1536>; + sched-energy-costs = <&CPU_COST_1 &CLUSTER_COST_1>; next-level-cache = <&L2_1>; qcom,ea = <&ea7>; L1_I_103: l1-icache { @@ -271,6 +279,134 @@ }; }; }; + energy-costs { + CPU_COST_0: core-cost0 { + busy-cost-data = < + 68 11 + 83 20 + 101 25 + 118 30 + 136 33 + 154 
36 + 155 36 + 189 42 + 203 47 + 220 54 + 239 62 + 226 67 + 270 73 + 287 79 + 305 88 + 322 95 + 342 104 + 358 111 + 385 134 + 402 155 + 420 178 + 438 201 + >; + idle-cost-data = < + 4 4 0 + >; + }; + CPU_COST_1: core-cost1 { + busy-cost-data = < + 132 56 + 153 76 + 186 91 + 220 105 + 254 118 + 289 135 + 323 150 + 357 162 + 400 181 + 435 196 + 467 214 + 502 229 + 528 248 + 561 280 + 596 316 + 630 354 + 664 392 + 698 439 + 733 495 + 767 565 + 800 622 + 833 691 + 869 792 + 903 889 + 938 1024 + 1020 1141 + 1024 1138 + >; + idle-cost-data = < + 10 10 0 + >; + }; + CLUSTER_COST_0: cluster-cost0 { + busy-cost-data = < + 68 17 + 83 18 + 101 18 + 118 20 + 136 21 + 154 23 + 155 23 + 189 24 + 203 27 + 220 29 + 239 30 + 226 32 + 270 33 + 287 35 + 305 38 + 322 39 + 342 42 + 358 46 + 385 48 + 402 53 + 420 59 + 438 66 + >; + idle-cost-data = < + 31 31 31 0 + >; + }; + CLUSTER_COST_1: cluster-cost1 { + busy-cost-data = < + 132 24 + 153 25 + 186 26 + 220 29 + 254 30 + 289 33 + 323 35 + 357 37 + 400 38 + 435 40 + 467 43 + 502 44 + 528 46 + 561 50 + 596 54 + 630 60 + 664 63 + 698 70 + 733 74 + 767 80 + 800 87 + 833 96 + 869 104 + 903 120 + 938 130 + 1020 203 + 1024 203 + >; + idle-cost-data = < + 50 50 50 0 + >; + }; + }; }; soc: soc { }; -- cgit v1.2.3 From 83dcbae147826d778c4edd1f3259de7f1c28403d Mon Sep 17 00:00:00 2001 From: Siqi Lin Date: Thu, 27 Apr 2017 09:43:56 -0700 Subject: ARM: dts: msm: Fix EAS idle-cost-data property length We need 4 idle-cost-data for CPUs, despite cpu_idle supporting only 3 different idle states. The idle-cost-data property length should always be one more entry longer than the number of available cpu_idle states. The idle-cost-data property has to have the same length for both CLUSTER_COST_N and CPU_COST_N. Bug: 37641804 Change-Id: Ic14a6a1ef4409e81c5adc23575f7d1157d6eadce Signed-off-by: Siqi Lin --- arch/arm/boot/dts/qcom/msm8998.dtsi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/boot/dts/qcom/msm8998.dtsi b/arch/arm/boot/dts/qcom/msm8998.dtsi index 89936736e0ff..0f8a46a05739 100644 --- a/arch/arm/boot/dts/qcom/msm8998.dtsi +++ b/arch/arm/boot/dts/qcom/msm8998.dtsi @@ -306,7 +306,7 @@ 438 201 >; idle-cost-data = < - 4 4 0 + 4 4 4 0 >; }; CPU_COST_1: core-cost1 { @@ -340,7 +340,7 @@ 1024 1138 >; idle-cost-data = < - 10 10 0 + 10 10 10 0 >; }; CLUSTER_COST_0: cluster-cost0 { -- cgit v1.2.3 From ab88411382f7c022679edcaa1e90b51947f91401 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 18 May 2017 11:38:57 +0100 Subject: ARM: dts: msm: fix EM to be monotonically increasing Change-Id: Iad2e3882a2e9d7dbbfd80cf485bbb1f0e664b04f Signed-off-by: Patrick Bellasi --- arch/arm/boot/dts/qcom/msm8998.dtsi | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/arm/boot/dts/qcom/msm8998.dtsi b/arch/arm/boot/dts/qcom/msm8998.dtsi index 0f8a46a05739..e6c4c6f1864f 100644 --- a/arch/arm/boot/dts/qcom/msm8998.dtsi +++ b/arch/arm/boot/dts/qcom/msm8998.dtsi @@ -287,13 +287,13 @@ 101 25 118 30 136 33 - 154 36 + 154 35 155 36 189 42 203 47 220 54 239 62 - 226 67 + 255 67 270 73 287 79 305 88 @@ -350,13 +350,13 @@ 101 18 118 20 136 21 - 154 23 + 154 22 155 23 189 24 203 27 220 29 239 30 - 226 32 + 255 32 270 33 287 35 305 38 @@ -399,7 +399,7 @@ 869 104 903 120 938 130 - 1020 203 + 1020 200 1024 203 >; idle-cost-data = < -- cgit v1.2.3 From 72f13941085b9b9f3b8cc6f6d84016ca44093265 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 18 May 2017 11:41:03 +0100 Subject: ARM: dts: msm: fix CPU's idle-cost-data CPU idle states are 
mapped into EAS energy model data structures according to this table: + cpu_idle_status | + idle-cost-data index | | + meaning | | | + expected energy cost | | | | -1 0: CPU active CPU energy > 0 0 1: CPU WFI CPU energy > 0 1 2: CPU off (cluster on) CPU energy = 0 2 3: CPU off (cluster off) CPU energy = 0 Change-Id: I4b51bb74cb96c265731f3872c95947474db973ac Signed-off-by: Patrick Bellasi --- arch/arm/boot/dts/qcom/msm8998.dtsi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/boot/dts/qcom/msm8998.dtsi b/arch/arm/boot/dts/qcom/msm8998.dtsi index e6c4c6f1864f..fb601f9a7bbc 100644 --- a/arch/arm/boot/dts/qcom/msm8998.dtsi +++ b/arch/arm/boot/dts/qcom/msm8998.dtsi @@ -306,7 +306,7 @@ 438 201 >; idle-cost-data = < - 4 4 4 0 + 4 4 0 0 >; }; CPU_COST_1: core-cost1 { @@ -340,7 +340,7 @@ 1024 1138 >; idle-cost-data = < - 10 10 10 0 + 10 10 0 0 >; }; CLUSTER_COST_0: cluster-cost0 { -- cgit v1.2.3 From b65c91c9aa14ce6e8d61d488878c0dc734508a76 Mon Sep 17 00:00:00 2001 From: Andres Oportus Date: Fri, 25 Aug 2017 16:46:36 -0700 Subject: ARM: dts: msm: add HW CPU's busy-cost-data for additional freqs Initial Enery Model was calculated with a device including less number of available frequencies. This change adds the missing values, note that all performance values had to be updated so they would be re-normalized to 0-1024. Bug: 64837462 Test: YouTube did not have energy regression Change-Id: I2b4c62d06e39fe0da524af96568187042664d62a Signed-off-by: Andres Oportus --- arch/arm/boot/dts/qcom/msm8998.dtsi | 204 +++++++++++++++++++----------------- 1 file changed, 106 insertions(+), 98 deletions(-) diff --git a/arch/arm/boot/dts/qcom/msm8998.dtsi b/arch/arm/boot/dts/qcom/msm8998.dtsi index fb601f9a7bbc..b9a38ddc5ba8 100644 --- a/arch/arm/boot/dts/qcom/msm8998.dtsi +++ b/arch/arm/boot/dts/qcom/msm8998.dtsi @@ -282,28 +282,28 @@ energy-costs { CPU_COST_0: core-cost0 { busy-cost-data = < - 68 11 - 83 20 - 101 25 - 118 30 - 136 33 - 154 35 - 155 36 - 189 42 - 203 47 - 220 54 - 239 62 - 255 67 - 270 73 - 287 79 - 305 88 - 322 95 - 342 104 - 358 111 - 385 134 - 402 155 - 420 178 - 438 201 + 65 11 + 80 20 + 96 25 + 113 30 + 130 33 + 147 35 + 164 36 + 181 42 + 194 47 + 211 54 + 228 62 + 243 67 + 258 73 + 275 79 + 292 88 + 308 95 + 326 104 + 342 111 + 368 134 + 384 155 + 401 178 + 419 201 >; idle-cost-data = < 4 4 0 0 @@ -311,62 +311,66 @@ }; CPU_COST_1: core-cost1 { busy-cost-data = < - 132 56 - 153 76 - 186 91 - 220 105 - 254 118 - 289 135 - 323 150 - 357 162 - 400 181 - 435 196 - 467 214 - 502 229 - 528 248 - 561 280 - 596 316 - 630 354 - 664 392 - 698 439 - 733 495 - 767 565 - 800 622 - 833 691 - 869 792 - 903 889 - 938 1024 - 1020 1141 - 1024 1138 + 129 56 + 148 76 + 182 91 + 216 105 + 247 118 + 278 135 + 312 150 + 344 162 + 391 181 + 419 196 + 453 214 + 487 229 + 509 248 + 546 280 + 581 316 + 615 354 + 650 392 + 676 439 + 712 495 + 739 565 + 776 622 + 803 691 + 834 792 + 881 889 + 914 1059 + 957 1244 + 975 1375 + 996 1549 + 1016 1617 + 1021 1677 + 1024 1683 >; idle-cost-data = < - 10 10 0 0 + 10 10 0 0 >; }; CLUSTER_COST_0: cluster-cost0 { busy-cost-data = < - 68 17 - 83 18 - 101 18 - 118 20 - 136 21 - 154 22 - 155 23 - 189 24 - 203 27 - 220 29 - 239 30 - 255 32 - 270 33 - 287 35 - 305 38 - 322 39 - 342 42 - 358 46 - 385 48 - 402 53 - 420 59 - 438 66 + 65 17 + 80 18 + 96 18 + 113 20 + 130 21 + 147 22 + 164 23 + 181 24 + 194 27 + 211 29 + 228 30 + 243 32 + 258 33 + 275 35 + 292 38 + 308 39 + 326 42 + 342 46 + 368 48 + 384 53 + 401 59 + 419 66 >; idle-cost-data = < 31 31 31 0 @@ 
-374,32 +378,36 @@ }; CLUSTER_COST_1: cluster-cost1 { busy-cost-data = < - 132 24 - 153 25 - 186 26 - 220 29 - 254 30 - 289 33 - 323 35 - 357 37 - 400 38 - 435 40 - 467 43 - 502 44 - 528 46 - 561 50 - 596 54 - 630 60 - 664 63 - 698 70 - 733 74 - 767 80 - 800 87 - 833 96 - 869 104 - 903 120 - 938 130 - 1020 200 + 129 24 + 148 25 + 182 26 + 216 29 + 247 30 + 278 33 + 312 35 + 344 37 + 391 38 + 419 40 + 453 43 + 487 44 + 509 46 + 546 50 + 581 54 + 615 60 + 650 63 + 676 70 + 712 74 + 739 80 + 776 87 + 803 96 + 834 104 + 881 120 + 914 130 + 957 171 + 975 178 + 996 185 + 1016 200 + 1021 202 1024 203 >; idle-cost-data = < -- cgit v1.2.3 From 9539942cb065e9ec5749a89077b12fc3a0c51b0a Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Thu, 18 May 2017 22:41:33 -0700 Subject: FROMLIST: cpufreq: Make iowait boost a policy option Make iowait boost a cpufreq policy option and enable it for intel_pstate cpufreq driver. Governors like schedutil can use it to determine if boosting for tasks that wake up with p->in_iowait set is needed. Bug: 38010527 Link: https://lkml.org/lkml/2017/5/19/43 Change-Id: Icf59e75fbe731dc67abb28fb837f7bb0cd5ec6cc Signed-off-by: Joel Fernandes --- drivers/cpufreq/intel_pstate.c | 1 + include/linux/cpufreq.h | 3 +++ 2 files changed, 4 insertions(+) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 15fcf2cac971..53226f33ea98 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -1187,6 +1187,7 @@ static int intel_pstate_cpu_init(struct cpufreq_policy *policy) policy->min = cpu->pstate.min_pstate * cpu->pstate.scaling; policy->max = cpu->pstate.turbo_pstate * cpu->pstate.scaling; + policy->iowait_boost_enable = true; /* cpuinfo and default policy values */ policy->cpuinfo.min_freq = cpu->pstate.min_pstate * cpu->pstate.scaling; diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 9302d016b89f..8e9d08dfbd18 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -123,6 +123,9 @@ struct cpufreq_policy { unsigned int up_transition_delay_us; unsigned int down_transition_delay_us; + /* Boost switch for tasks with p->in_iowait set */ + bool iowait_boost_enable; + /* Cached frequency lookup from cpufreq_driver_resolve_freq. */ unsigned int cached_target_freq; int cached_resolved_idx; -- cgit v1.2.3 From ff383d94478af0bb62f828bad550e42681a7176e Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Thu, 18 May 2017 22:46:10 -0700 Subject: FROMLIST: sched: Make iowait_boost optional in schedutil We should apply the iowait boost only if cpufreq policy has iowait boost enabled. Also make it a schedutil configuration from sysfs so it can be turned on/off if needed (by default initialize it to the policy value). 
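The diff above wires a per-policy opt-in flag (policy->iowait_boost_enable) into cpufreq, and the schedutil hunks that follow consume it. Below is a rough, userspace-only C sketch of that gate; the struct and function names are simplified stand-ins for the real policy/sugov code, not the kernel definitions.

#include <stdio.h>
#include <stdbool.h>

#define SCHED_CPUFREQ_IOWAIT 0x01

struct policy   { bool iowait_boost_enable; };
struct tunables { bool iowait_boost_enable; };
struct sg_cpu   { unsigned int iowait_boost, iowait_boost_max; };

static void governor_init(struct tunables *t, const struct policy *p)
{
	/* mirrors: tunables->iowait_boost_enable = policy->iowait_boost_enable */
	t->iowait_boost_enable = p->iowait_boost_enable;
}

static void set_iowait_boost(struct sg_cpu *sg, const struct tunables *t,
			     unsigned int flags)
{
	if (!t->iowait_boost_enable)
		return;				/* driver/policy opted out */
	if (flags & SCHED_CPUFREQ_IOWAIT)
		sg->iowait_boost = sg->iowait_boost_max;
}

int main(void)
{
	struct policy p = { .iowait_boost_enable = true };	/* e.g. an intel_pstate-like driver */
	struct tunables tun;
	struct sg_cpu cpu = { .iowait_boost = 0, .iowait_boost_max = 1024 };

	governor_init(&tun, &p);
	set_iowait_boost(&cpu, &tun, SCHED_CPUFREQ_IOWAIT);
	printf("iowait_boost after IO-wait wakeup: %u\n", cpu.iowait_boost);
	return 0;
}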
For systems that don't need/want it enabled, such as those on arm64 based mobile devices that are battery operated, it saves energy when the cpufreq driver policy doesn't have it enabled (details below): Here are some results for energy measurements collected running a YouTube video for 30 seconds: Before: 8.042533 mWh After: 7.948377 mWh Energy savings is ~1.2% Bug: 38010527 Link: https://lkml.org/lkml/2017/5/19/42 Change-Id: If124076ad0c16ade369253840dedfbf870aff927 Signed-off-by: Joel Fernandes --- kernel/sched/cpufreq_schedutil.c | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 6c84b4d28914..6effb44aeb30 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -33,6 +33,7 @@ struct sugov_tunables { struct gov_attr_set attr_set; unsigned int up_rate_limit_us; unsigned int down_rate_limit_us; + bool iowait_boost_enable; }; struct sugov_policy { @@ -228,6 +229,11 @@ static void sugov_get_util(unsigned long *util, unsigned long *max, u64 time) static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, unsigned int flags) { + struct sugov_policy *sg_policy = sg_cpu->sg_policy; + + if (!sg_policy->tunables->iowait_boost_enable) + return; + if (flags & SCHED_CPUFREQ_IOWAIT) { if (sg_cpu->iowait_boost_pending) return; @@ -510,12 +516,36 @@ static ssize_t down_rate_limit_us_store(struct gov_attr_set *attr_set, return count; } +static ssize_t iowait_boost_enable_show(struct gov_attr_set *attr_set, + char *buf) +{ + struct sugov_tunables *tunables = to_sugov_tunables(attr_set); + + return sprintf(buf, "%u\n", tunables->iowait_boost_enable); +} + +static ssize_t iowait_boost_enable_store(struct gov_attr_set *attr_set, + const char *buf, size_t count) +{ + struct sugov_tunables *tunables = to_sugov_tunables(attr_set); + bool enable; + + if (kstrtobool(buf, &enable)) + return -EINVAL; + + tunables->iowait_boost_enable = enable; + + return count; +} + static struct governor_attr up_rate_limit_us = __ATTR_RW(up_rate_limit_us); static struct governor_attr down_rate_limit_us = __ATTR_RW(down_rate_limit_us); +static struct governor_attr iowait_boost_enable = __ATTR_RW(iowait_boost_enable); static struct attribute *sugov_attributes[] = { &up_rate_limit_us.attr, &down_rate_limit_us.attr, + &iowait_boost_enable.attr, NULL }; @@ -675,6 +705,8 @@ static int sugov_init(struct cpufreq_policy *policy) } } + tunables->iowait_boost_enable = policy->iowait_boost_enable; + policy->governor_data = sg_policy; sg_policy->tunables = tunables; -- cgit v1.2.3 From ebdb82f7b34aeab34623d7a5e4dd673fc2807842 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Thu, 20 Jul 2017 23:46:56 -0700 Subject: sched/fair: Skip frequency updates if CPU about to idle If CPU is about to idle, prevent a frequency update. With the number of schedutil governor wake ups are reduced by more than half on a test playing bluetooth audio. 
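With the iowait_boost_enable governor attribute added in the schedutil patch above, toggling the boost from userspace is a plain sysfs write. The helper below is a hedged sketch: the exact attribute path is an assumption (it depends on the kernel's cpufreq sysfs layout and on whether per-policy tunables are in use), so adjust it for the target device.

#include <stdio.h>

int main(int argc, char **argv)
{
	/* Assumed path -- verify on the target; may also live under
	 * /sys/devices/system/cpu/cpufreq/schedutil/ on shared-tunable setups. */
	const char *path =
		"/sys/devices/system/cpu/cpufreq/policy0/schedutil/iowait_boost_enable";
	const char *value = (argc > 1) ? argv[1] : "0";	/* "0" or "1" */
	FILE *f = fopen(path, "w");

	if (!f) {
		perror("fopen");
		return 1;
	}
	fprintf(f, "%s\n", value);
	fclose(f);
	return 0;
}

Run as root with "1" or "0" as the argument; it is equivalent to echoing the value into the file from a shell.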
Test: sugov wake ups drop by more than half when playing music with screen off (476 / 1092) Bug: 64689959 Change-Id: I400026557b4134c0ac77f51c79610a96eb985b4a Signed-off-by: Joel Fernandes --- kernel/sched/fair.c | 18 +++++++++++++++--- kernel/sched/sched.h | 1 + 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 08e608a04f5b..ee5f8e686a31 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4332,6 +4332,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) */ #define UPDATE_TG 0x1 #define SKIP_AGE_LOAD 0x2 +#define SKIP_CPUFREQ 0x4 /* Update task and its cfs_rq load average */ static inline void update_load_avg(struct sched_entity *se, int flags) @@ -4352,7 +4353,7 @@ static inline void update_load_avg(struct sched_entity *se, int flags) cfs_rq->curr == se, NULL); } - decayed = update_cfs_rq_load_avg(now, cfs_rq, true); + decayed = update_cfs_rq_load_avg(now, cfs_rq, !(flags & SKIP_CPUFREQ)); decayed |= propagate_entity_load_avg(se); if (decayed && (flags & UPDATE_TG)) @@ -4528,6 +4529,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) #define UPDATE_TG 0x0 #define SKIP_AGE_LOAD 0x0 +#define SKIP_CPUFREQ 0x3 static inline void update_load_avg(struct sched_entity *se, int not_used1){} static inline void @@ -4750,6 +4752,8 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); static void dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { + int update_flags; + /* * Update run-time statistics of the 'current'. */ @@ -4763,7 +4767,12 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * - For group entity, update its weight to reflect the new share * of its group cfs_rq. */ - update_load_avg(se, UPDATE_TG); + update_flags = UPDATE_TG; + + if (flags & DEQUEUE_IDLE) + update_flags |= SKIP_CPUFREQ; + + update_load_avg(se, update_flags); dequeue_entity_load_avg(cfs_rq, se); update_stats_dequeue(cfs_rq, se); @@ -6038,6 +6047,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct sched_entity *se = &p->se; int task_sleep = flags & DEQUEUE_SLEEP; + if (task_sleep && rq->nr_running == 1) + flags |= DEQUEUE_IDLE; + for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); dequeue_entity(cfs_rq, se, flags); @@ -6078,7 +6090,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; - update_load_avg(se, UPDATE_TG); + update_load_avg(se, UPDATE_TG | (flags & DEQUEUE_IDLE)); update_cfs_shares(se); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 284cc86d3ad4..bafa2931c898 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2032,6 +2032,7 @@ static const u32 prio_to_wmult[40] = { #define DEQUEUE_SLEEP 0x01 #define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */ #define DEQUEUE_MOVE 0x04 /* matches ENQUEUE_MOVE */ +#define DEQUEUE_IDLE 0x80 /* The last dequeue before IDLE */ #define ENQUEUE_WAKEUP 0x01 #define ENQUEUE_RESTORE 0x02 -- cgit v1.2.3 From b775cb29f66382f04ba4c1e7ad385081a020269b Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Tue, 27 Feb 2018 15:29:09 +0000 Subject: ANDROID: Move schedtune en/dequeue before schedutil update triggers CPU rq util updates happen when rq signals are updated as part of enqueue and dequeue operations. Doing these updates triggers a call to the registered util update handler, which takes schedtune boosting into account. 
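The SKIP_CPUFREQ/DEQUEUE_IDLE plumbing in the hunks above can be hard to follow in diff form. The standalone C sketch below models only the flag flow; it is a simplification (the real update_load_avg() does far more) meant to show when the governor update is skipped.

#include <stdio.h>
#include <stdbool.h>

#define UPDATE_TG     0x1
#define SKIP_CPUFREQ  0x4
#define DEQUEUE_SLEEP 0x01
#define DEQUEUE_IDLE  0x80	/* the last dequeue before idle */

static void update_load_avg(int flags)
{
	bool update_freq = !(flags & SKIP_CPUFREQ);
	printf("load updated, cpufreq update %s\n",
	       update_freq ? "requested" : "skipped (CPU about to idle)");
}

static void dequeue_task(int nr_running, int flags)
{
	int update_flags = UPDATE_TG;

	if ((flags & DEQUEUE_SLEEP) && nr_running == 1)
		flags |= DEQUEUE_IDLE;
	if (flags & DEQUEUE_IDLE)
		update_flags |= SKIP_CPUFREQ;

	update_load_avg(update_flags);
}

int main(void)
{
	dequeue_task(3, DEQUEUE_SLEEP);	/* other tasks remain: update freq   */
	dequeue_task(1, DEQUEUE_SLEEP);	/* last task sleeps: skip the update */
	return 0;
}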
Enqueueing the task in the correct schedtune group after this happens means that we will potentially not see the boost for an entire throttle period. Move the enqueue/dequeue operations for schedtune before the signal updates which can trigger OPP changes. Change-Id: I4236e6b194bc5daad32ff33067d4be1987996780 Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 63 +++++++++++++++++++++++++---------------------------- kernel/sched/rt.c | 8 +++++++ 2 files changed, 38 insertions(+), 33 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ee5f8e686a31..ef6046d3a016 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5951,6 +5951,25 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct sched_entity *se = &p->se; #ifdef CONFIG_SMP int task_new = flags & ENQUEUE_WAKEUP_NEW; + + /* + * Update SchedTune accounting. + * + * We do it before updating the CPU capacity to ensure the + * boost value of the current task is accounted for in the + * selection of the OPP. + * + * We do it also in the case where we enqueue a throttled task; + * we could argue that a throttled task should not boost a CPU, + * however: + * a) properly implementing CPU boosting considering throttled + * tasks will increase a lot the complexity of the solution + * b) it's not easy to quantify the benefits introduced by + * such a more complex solution. + * Thus, for the time being we go for the simple solution and boost + * also for throttled RQs. + */ + schedtune_enqueue_task(p, cpu_of(rq)); #endif /* @@ -6001,26 +6020,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) } #ifdef CONFIG_SMP - - /* - * Update SchedTune accounting. - * - * We do it before updating the CPU capacity to ensure the - * boost value of the current task is accounted for in the - * selection of the OPP. - * - * We do it also in the case where we enqueue a throttled task; - * we could argue that a throttled task should not boost a CPU, - * however: - * a) properly implementing CPU boosting considering throttled - * tasks will increase a lot the complexity of the solution - * b) it's not easy to quantify the benefits introduced by - * such a more complex solution. - * Thus, for the time being we go for the simple solution and boost - * also for throttled RQs. - */ - schedtune_enqueue_task(p, cpu_of(rq)); - if (energy_aware() && !se) { walt_inc_cumulative_runnable_avg(rq, p); if (!task_new && !rq->rd->overutilized && @@ -6050,6 +6049,17 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (task_sleep && rq->nr_running == 1) flags |= DEQUEUE_IDLE; +#ifdef CONFIG_SMP + /* + * Update SchedTune accounting + * + * We do it before updating the CPU capacity to ensure the + * boost value of the current task is accounted for in the + * selection of the OPP. + */ + schedtune_dequeue_task(p, cpu_of(rq)); +#endif + for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); dequeue_entity(cfs_rq, se, flags); @@ -6099,19 +6109,6 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) dec_rq_hmp_stats(rq, p, 1); } -#ifdef CONFIG_SMP - - /* - * Update SchedTune accounting - * - * We do it before updating the CPU capacity to ensure the - * boost value of the current task is accounted for in the - * selection of the OPP. 
- */ - schedtune_dequeue_task(p, cpu_of(rq)); - -#endif /* CONFIG_SMP */ - hrtick_update(rq); } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 2083a54cdd49..ac81704e14d9 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1446,6 +1446,10 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) { struct sched_rt_entity *rt_se = &p->rt; +#ifdef CONFIG_SMP + schedtune_enqueue_task(p, cpu_of(rq)); +#endif + if (flags & ENQUEUE_WAKEUP) rt_se->timeout = 0; @@ -1488,6 +1492,10 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) { struct sched_rt_entity *rt_se = &p->rt; +#ifdef CONFIG_SMP + schedtune_dequeue_task(p, cpu_of(rq)); +#endif + update_curr_rt(rq); dequeue_rt_entity(rt_se, flags); walt_dec_cumulative_runnable_avg(rq, p); -- cgit v1.2.3 From 891a63210e1d65ed226050ce7921dcec210a671a Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Fri, 18 Aug 2017 13:29:46 -0700 Subject: sched/fair: Fix issue where frequency update not skipped This patch fixes one of the infrequent conditions in commit 54b6baeca500 ("sched/fair: Skip frequency updates if CPU about to idle") where we could have skipped a frequency update. The fix is to use the correct flag which skips freq updates. Note that this is a rare issue (can show up only during CFS throttling) and even then we just do an additional frequency update which we were doing anyway before the above patch. Bug: 64689959 Change-Id: I0117442f395cea932ad56617065151bdeb9a3b53 Signed-off-by: Joel Fernandes --- kernel/sched/fair.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ef6046d3a016..2015cf048a44 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4529,7 +4529,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) #define UPDATE_TG 0x0 #define SKIP_AGE_LOAD 0x0 -#define SKIP_CPUFREQ 0x3 +#define SKIP_CPUFREQ 0x0 static inline void update_load_avg(struct sched_entity *se, int not_used1){} static inline void @@ -6092,6 +6092,8 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) } for_each_sched_entity(se) { + int update_flags; + cfs_rq = cfs_rq_of(se); cfs_rq->h_nr_running--; walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p); @@ -6100,7 +6102,12 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; - update_load_avg(se, UPDATE_TG | (flags & DEQUEUE_IDLE)); + update_flags = UPDATE_TG; + + if (flags & DEQUEUE_IDLE) + update_flags |= SKIP_CPUFREQ; + + update_load_avg(se, update_flags); update_cfs_shares(se); } -- cgit v1.2.3 From 629bfed360f99e297f7d8042955710aadbde2123 Mon Sep 17 00:00:00 2001 From: Raghavendra Kakarla Date: Fri, 1 Jun 2018 19:06:53 +0530 Subject: kernel: power: qos: remove check for core isolation while cluster LPMs Since all cores in a cluster are in isolation, PMQoS latency constraint set by clock driver to switch PLL is ignored. So, Cluster enter to L2PC and SPM is trying to disable the PLL and at same time clock driver trying to switch the PLL from other cluster which leads to the synchronization issues. Fix is although all cores are in isolation, honor PMQoS request for cluster LPMs. 
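The qos.c hunk that follows removes the cpu_isolated() skip from pm_qos_request_for_cpumask(). The toy C model below shows the aggregation that loop performs and why dropping isolated CPUs could hide a tight latency constraint; the cpumask is a plain bitmask and the values are made up for illustration.

#include <stdio.h>

enum pm_qos_type { PM_QOS_MIN, PM_QOS_MAX };

#define NR_CPUS 4

static int per_cpu_latency[NR_CPUS] = { 400, 400, 2, 400 };	/* usec; cpu2 holds a tight request */

static int pm_qos_request_for_cpumask(enum pm_qos_type type,
				      unsigned int mask, int default_value)
{
	int val = default_value;

	for (int cpu = 0; cpu < NR_CPUS; cpu++) {
		if (!(mask & (1u << cpu)))
			continue;
		/* Note: no cpu_isolated() skip any more. */
		if (type == PM_QOS_MIN && per_cpu_latency[cpu] < val)
			val = per_cpu_latency[cpu];
		else if (type == PM_QOS_MAX && per_cpu_latency[cpu] > val)
			val = per_cpu_latency[cpu];
	}
	return val;
}

int main(void)
{
	/* Cluster made of cpu2 and cpu3; cpu2 may be isolated but still counts. */
	printf("effective latency constraint: %d us\n",
	       pm_qos_request_for_cpumask(PM_QOS_MIN, 0xC, 400));
	return 0;
}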
Change-Id: I4296e16ef4e9046d1fbe3b7378e9f61a2f11c74d Signed-off-by: Raghavendra Kakarla --- kernel/power/qos.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 49dc710d4a3a..3e3ae5ed8100 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -477,8 +477,6 @@ int pm_qos_request_for_cpumask(int pm_qos_class, struct cpumask *mask) val = c->default_value; for_each_cpu(cpu, mask) { - if (cpu_isolated(cpu)) - continue; switch (c->type) { case PM_QOS_MIN: -- cgit v1.2.3 From 82d3f23d6dc53c564c1c8550f9ee6ac72f85c004 Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Wed, 20 Jun 2018 18:18:33 +0800 Subject: sched/fair: Fix bandwidth timer clock drift condition commit 512ac999d2755d2b7109e996a76b6fb8b888631d upstream. I noticed that cgroup task groups constantly get throttled even if they have low CPU usage, this causes some jitters on the response time to some of our business containers when enabling CPU quotas. It's very simple to reproduce: mkdir /sys/fs/cgroup/cpu/test cd /sys/fs/cgroup/cpu/test echo 100000 > cpu.cfs_quota_us echo $$ > tasks then repeat: cat cpu.stat | grep nr_throttled # nr_throttled will increase steadily After some analysis, we found that cfs_rq::runtime_remaining will be cleared by expire_cfs_rq_runtime() due to two equal but stale "cfs_{b|q}->runtime_expires" after period timer is re-armed. The current condition to judge clock drift in expire_cfs_rq_runtime() is wrong, the two runtime_expires are actually the same when clock drift happens, so this condtion can never hit. The orginal design was correctly done by this commit: a9cf55b28610 ("sched: Expire invalid runtime") ... but was changed to be the current implementation due to its locking bug. This patch introduces another way, it adds a new field in both structures cfs_rq and cfs_bandwidth to record the expiration update sequence, and uses them to figure out if clock drift happens (true if they are equal). Change-Id: I8168fe3b45785643536f289ea823d1a62d9d8ab2 Signed-off-by: Xunlei Pang Signed-off-by: Peter Zijlstra (Intel) [alakeshh: backport: Fixed merge conflicts: - sched.h: Fix the indentation and order in which the variables are declared to match with coding style of the existing code in 4.14 Struct members of same type were declared in separate lines in upstream patch which has been changed back to having multiple members of same type in the same line. e.g. 
int a; int b; -> int a, b; ] Signed-off-by: Alakesh Haloi Reviewed-by: Ben Segall Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: # 4.14.x Fixes: 51f2176d74ac ("sched/fair: Fix unlocked reads of some cfs_b->quota/period") Link: http://lkml.kernel.org/r/20180620101834.24455-1-xlpang@linux.alibaba.com Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- kernel/sched/fair.c | 14 ++++++++------ kernel/sched/sched.h | 4 +++- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2015cf048a44..63f9ad66cf11 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5068,6 +5068,7 @@ void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) now = sched_clock_cpu(smp_processor_id()); cfs_b->runtime = cfs_b->quota; cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period); + cfs_b->expires_seq++; } static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) @@ -5090,6 +5091,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) struct task_group *tg = cfs_rq->tg; struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); u64 amount = 0, min_amount, expires; + int expires_seq; /* note: this is a positive sum as runtime_remaining <= 0 */ min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining; @@ -5106,6 +5108,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) cfs_b->idle = 0; } } + expires_seq = cfs_b->expires_seq; expires = cfs_b->runtime_expires; raw_spin_unlock(&cfs_b->lock); @@ -5115,8 +5118,10 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) * spread between our sched_clock and the one on which runtime was * issued. */ - if ((s64)(expires - cfs_rq->runtime_expires) > 0) + if (cfs_rq->expires_seq != expires_seq) { + cfs_rq->expires_seq = expires_seq; cfs_rq->runtime_expires = expires; + } return cfs_rq->runtime_remaining > 0; } @@ -5142,12 +5147,9 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq) * has not truly expired. * * Fortunately we can check determine whether this the case by checking - * whether the global deadline has advanced. It is valid to compare - * cfs_b->runtime_expires without any locks since we only care about - * exact equality, so a partial write will still work. + * whether the global deadline(cfs_b->expires_seq) has advanced. */ - - if (cfs_rq->runtime_expires != cfs_b->runtime_expires) { + if (cfs_rq->expires_seq == cfs_b->expires_seq) { /* extend local deadline, drift is bounded above by 2 ticks */ cfs_rq->runtime_expires += TICK_NSEC; } else { diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index bafa2931c898..eaf5d3af2e92 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -227,8 +227,9 @@ struct cfs_bandwidth { u64 quota, runtime; s64 hierarchical_quota; u64 runtime_expires; + int expires_seq; - int idle, period_active; + short idle, period_active; struct hrtimer period_timer, slack_timer; struct list_head throttled_cfs_rq; @@ -522,6 +523,7 @@ struct cfs_rq { #endif int runtime_enabled; + int expires_seq; u64 runtime_expires; s64 runtime_remaining; -- cgit v1.2.3 From b933e4d37bc023d27c7394626669bae0a201da52 Mon Sep 17 00:00:00 2001 From: Dave Chiluk Date: Tue, 23 Jul 2019 11:44:26 -0500 Subject: sched/fair: Fix low cpu usage with high throttling by removing expiration of cpu-local slices commit de53fd7aedb100f03e5d2231cfce0e4993282425 upstream. 
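As a rough illustration of the sequence-counter handshake this backport adds, the standalone C sketch below models a global pool that bumps expires_seq on every refill and a local runqueue that treats a matching sequence as clock drift and a differing one as a genuinely new period. The struct layouts are simplified stand-ins, not the kernel definitions.

#include <stdio.h>

struct cfs_bandwidth { unsigned long long runtime; int expires_seq; };
struct cfs_rq        { long long runtime_remaining; int expires_seq; };

static void refill(struct cfs_bandwidth *b, unsigned long long quota)
{
	b->runtime = quota;
	b->expires_seq++;			/* new period */
}

static void expire_local_runtime(struct cfs_rq *rq, const struct cfs_bandwidth *b)
{
	if (rq->runtime_remaining < 0)
		return;
	if (rq->expires_seq == b->expires_seq)
		printf("same period: treat as clock drift, extend local deadline\n");
	else {
		rq->runtime_remaining = 0;	/* global deadline moved on */
		printf("new period: local runtime expired\n");
	}
}

int main(void)
{
	struct cfs_bandwidth b = { 0, 0 };
	struct cfs_rq rq = { 3000000, 0 };	/* 3ms left locally */

	refill(&b, 10000000ULL);		/* period N */
	rq.expires_seq = b.expires_seq;		/* runtime assigned in period N */
	expire_local_runtime(&rq, &b);		/* drift case */

	refill(&b, 10000000ULL);		/* period N+1 */
	expire_local_runtime(&rq, &b);		/* truly expired */
	return 0;
}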
It has been observed, that highly-threaded, non-cpu-bound applications running under cpu.cfs_quota_us constraints can hit a high percentage of periods throttled while simultaneously not consuming the allocated amount of quota. This use case is typical of user-interactive non-cpu bound applications, such as those running in kubernetes or mesos when run on multiple cpu cores. This has been root caused to cpu-local run queue being allocated per cpu bandwidth slices, and then not fully using that slice within the period. At which point the slice and quota expires. This expiration of unused slice results in applications not being able to utilize the quota for which they are allocated. The non-expiration of per-cpu slices was recently fixed by 'commit 512ac999d275 ("sched/fair: Fix bandwidth timer clock drift condition")'. Prior to that it appears that this had been broken since at least 'commit 51f2176d74ac ("sched/fair: Fix unlocked reads of some cfs_b->quota/period")' which was introduced in v3.16-rc1 in 2014. That added the following conditional which resulted in slices never being expired. if (cfs_rq->runtime_expires != cfs_b->runtime_expires) { /* extend local deadline, drift is bounded above by 2 ticks */ cfs_rq->runtime_expires += TICK_NSEC; Because this was broken for nearly 5 years, and has recently been fixed and is now being noticed by many users running kubernetes (https://github.com/kubernetes/kubernetes/issues/67577) it is my opinion that the mechanisms around expiring runtime should be removed altogether. This allows quota already allocated to per-cpu run-queues to live longer than the period boundary. This allows threads on runqueues that do not use much CPU to continue to use their remaining slice over a longer period of time than cpu.cfs_period_us. However, this helps prevent the above condition of hitting throttling while also not fully utilizing your cpu quota. This theoretically allows a machine to use slightly more than its allotted quota in some periods. This overflow would be bounded by the remaining quota left on each per-cpu runqueueu. This is typically no more than min_cfs_rq_runtime=1ms per cpu. For CPU bound tasks this will change nothing, as they should theoretically fully utilize all of their quota in each period. For user-interactive tasks as described above this provides a much better user/application experience as their cpu utilization will more closely match the amount they requested when they hit throttling. This means that cpu limits no longer strictly apply per period for non-cpu bound applications, but that they are still accurate over longer timeframes. This greatly improves performance of high-thread-count, non-cpu bound applications with low cfs_quota_us allocation on high-core-count machines. In the case of an artificial testcase (10ms/100ms of quota on 80 CPU machine), this commit resulted in almost 30x performance improvement, while still maintaining correct cpu quota restrictions. That testcase is available at https://github.com/indeedeng/fibtest. 
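A toy, userspace-only model of the behaviour described above (and implemented by the diff below): per-CPU runtime is handed out in slices from a global pool, and local leftovers now survive the period refill instead of being expired. Quota, slice, and usage numbers are illustrative only.

#include <stdio.h>

#define NSEC_PER_MSEC 1000000ULL
static unsigned long long quota = 10 * NSEC_PER_MSEC;	/* cfs_quota per period */
static unsigned long long slice = 5 * NSEC_PER_MSEC;	/* sched_cfs_bandwidth_slice */
static unsigned long long global_runtime;		/* cfs_b->runtime */

/* Refill at each period edge: only the global pool is reset;
 * per-CPU leftovers are deliberately left alone (the point of the patch). */
static void refill_period(void)
{
	global_runtime = quota;
}

/* Hand at most one slice from the global pool to a local runqueue. */
static void assign_runtime(long long *local_remaining)
{
	unsigned long long amount = slice < global_runtime ? slice : global_runtime;

	global_runtime -= amount;
	*local_remaining += (long long)amount;
}

int main(void)
{
	long long cpu_local = 0;		/* cfs_rq->runtime_remaining */

	refill_period();
	assign_runtime(&cpu_local);
	cpu_local -= 4 * NSEC_PER_MSEC;		/* the CPU used only 4ms of its 5ms slice */

	/* Next period: the unused 1ms survives instead of being expired. */
	refill_period();
	printf("carried over %lld us of local runtime into the new period\n",
	       cpu_local / 1000);
	return 0;
}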
Fixes: 512ac999d275 ("sched/fair: Fix bandwidth timer clock drift condition") Change-Id: I7d7a39fb554ec0c31f9381f492165f43c70b3924 Signed-off-by: Dave Chiluk Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Phil Auld Reviewed-by: Ben Segall Cc: Ingo Molnar Cc: John Hammond Cc: Jonathan Corbet Cc: Kyle Anderson Cc: Gabriel Munos Cc: Peter Oskolkov Cc: Cong Wang Cc: Brendan Gregg Link: https://lkml.kernel.org/r/1563900266-19734-2-git-send-email-chiluk+linux@indeed.com Signed-off-by: Greg Kroah-Hartman --- Documentation/scheduler/sched-bwc.txt | 45 ++++++++++++++++++++++ kernel/sched/fair.c | 70 ++++------------------------------- kernel/sched/sched.h | 4 -- 3 files changed, 52 insertions(+), 67 deletions(-) diff --git a/Documentation/scheduler/sched-bwc.txt b/Documentation/scheduler/sched-bwc.txt index f6b1873f68ab..de583fbbfe42 100644 --- a/Documentation/scheduler/sched-bwc.txt +++ b/Documentation/scheduler/sched-bwc.txt @@ -90,6 +90,51 @@ There are two ways in which a group may become throttled: In case b) above, even though the child may have runtime remaining it will not be allowed to until the parent's runtime is refreshed. +CFS Bandwidth Quota Caveats +--------------------------- +Once a slice is assigned to a cpu it does not expire. However all but 1ms of +the slice may be returned to the global pool if all threads on that cpu become +unrunnable. This is configured at compile time by the min_cfs_rq_runtime +variable. This is a performance tweak that helps prevent added contention on +the global lock. + +The fact that cpu-local slices do not expire results in some interesting corner +cases that should be understood. + +For cgroup cpu constrained applications that are cpu limited this is a +relatively moot point because they will naturally consume the entirety of their +quota as well as the entirety of each cpu-local slice in each period. As a +result it is expected that nr_periods roughly equal nr_throttled, and that +cpuacct.usage will increase roughly equal to cfs_quota_us in each period. + +For highly-threaded, non-cpu bound applications this non-expiration nuance +allows applications to briefly burst past their quota limits by the amount of +unused slice on each cpu that the task group is running on (typically at most +1ms per cpu or as defined by min_cfs_rq_runtime). This slight burst only +applies if quota had been assigned to a cpu and then not fully used or returned +in previous periods. This burst amount will not be transferred between cores. +As a result, this mechanism still strictly limits the task group to quota +average usage, albeit over a longer time window than a single period. This +also limits the burst ability to no more than 1ms per cpu. This provides +better more predictable user experience for highly threaded applications with +small quota limits on high core count machines. It also eliminates the +propensity to throttle these applications while simultanously using less than +quota amounts of cpu. Another way to say this, is that by allowing the unused +portion of a slice to remain valid across periods we have decreased the +possibility of wastefully expiring quota on cpu-local silos that don't need a +full slice's amount of cpu time. + +The interaction between cpu-bound and non-cpu-bound-interactive applications +should also be considered, especially when single core usage hits 100%. 
If you +gave each of these applications half of a cpu-core and they both got scheduled +on the same CPU it is theoretically possible that the non-cpu bound application +will use up to 1ms additional quota in some periods, thereby preventing the +cpu-bound application from fully using its quota by that same amount. In these +instances it will be up to the CFS algorithm (see sched-design-CFS.rst) to +decide which application is chosen to run, as they will both be runnable and +have remaining quota. This runtime discrepancy will be made up in the following +periods when the interactive application idles. + Examples -------- 1. Limit a group to 1 CPU worth of runtime. diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 63f9ad66cf11..266fc95f6c0f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5067,8 +5067,6 @@ void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) now = sched_clock_cpu(smp_processor_id()); cfs_b->runtime = cfs_b->quota; - cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period); - cfs_b->expires_seq++; } static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) @@ -5090,8 +5088,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) { struct task_group *tg = cfs_rq->tg; struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); - u64 amount = 0, min_amount, expires; - int expires_seq; + u64 amount = 0, min_amount; /* note: this is a positive sum as runtime_remaining <= 0 */ min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining; @@ -5108,61 +5105,17 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) cfs_b->idle = 0; } } - expires_seq = cfs_b->expires_seq; - expires = cfs_b->runtime_expires; raw_spin_unlock(&cfs_b->lock); cfs_rq->runtime_remaining += amount; - /* - * we may have advanced our local expiration to account for allowed - * spread between our sched_clock and the one on which runtime was - * issued. - */ - if (cfs_rq->expires_seq != expires_seq) { - cfs_rq->expires_seq = expires_seq; - cfs_rq->runtime_expires = expires; - } return cfs_rq->runtime_remaining > 0; } -/* - * Note: This depends on the synchronization provided by sched_clock and the - * fact that rq->clock snapshots this value. - */ -static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq) -{ - struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); - - /* if the deadline is ahead of our clock, nothing to do */ - if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0)) - return; - - if (cfs_rq->runtime_remaining < 0) - return; - - /* - * If the local deadline has passed we have to consider the - * possibility that our sched_clock is 'fast' and the global deadline - * has not truly expired. - * - * Fortunately we can check determine whether this the case by checking - * whether the global deadline(cfs_b->expires_seq) has advanced. 
- */ - if (cfs_rq->expires_seq == cfs_b->expires_seq) { - /* extend local deadline, drift is bounded above by 2 ticks */ - cfs_rq->runtime_expires += TICK_NSEC; - } else { - /* global deadline is ahead, expiration has passed */ - cfs_rq->runtime_remaining = 0; - } -} - static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) { /* dock delta_exec before expiring quota (as it could span periods) */ cfs_rq->runtime_remaining -= delta_exec; - expire_cfs_rq_runtime(cfs_rq); if (likely(cfs_rq->runtime_remaining > 0)) return; @@ -5396,8 +5349,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) cpu_temp(cpu_of(rq))); } -static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, - u64 remaining, u64 expires) +static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining) { struct cfs_rq *cfs_rq; u64 runtime; @@ -5418,7 +5370,6 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, remaining -= runtime; cfs_rq->runtime_remaining += runtime; - cfs_rq->runtime_expires = expires; /* we check whether we're throttled above */ if (cfs_rq->runtime_remaining > 0) @@ -5443,7 +5394,7 @@ next: */ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) { - u64 runtime, runtime_expires; + u64 runtime; int throttled; /* no need to continue the timer with no bandwidth constraint */ @@ -5471,8 +5422,6 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) /* account preceding periods in which throttling occurred */ cfs_b->nr_throttled += overrun; - runtime_expires = cfs_b->runtime_expires; - /* * This check is repeated as we are holding onto the new bandwidth while * we unthrottle. This can potentially race with an unthrottled group @@ -5485,8 +5434,7 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) cfs_b->distribute_running = 1; raw_spin_unlock(&cfs_b->lock); /* we can't nest cfs_b->lock while distributing bandwidth */ - runtime = distribute_cfs_runtime(cfs_b, runtime, - runtime_expires); + runtime = distribute_cfs_runtime(cfs_b, runtime); raw_spin_lock(&cfs_b->lock); cfs_b->distribute_running = 0; @@ -5563,8 +5511,7 @@ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq) return; raw_spin_lock(&cfs_b->lock); - if (cfs_b->quota != RUNTIME_INF && - cfs_rq->runtime_expires == cfs_b->runtime_expires) { + if (cfs_b->quota != RUNTIME_INF) { cfs_b->runtime += slack_runtime; /* we are under rq->lock, defer unthrottling using a timer */ @@ -5596,7 +5543,6 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) { u64 runtime = 0, slice = sched_cfs_bandwidth_slice(); - u64 expires; /* confirm we're still not at a refresh boundary */ raw_spin_lock(&cfs_b->lock); @@ -5613,7 +5559,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) runtime = cfs_b->runtime; - expires = cfs_b->runtime_expires; if (runtime) cfs_b->distribute_running = 1; @@ -5622,11 +5567,10 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) if (!runtime) return; - runtime = distribute_cfs_runtime(cfs_b, runtime, expires); + runtime = distribute_cfs_runtime(cfs_b, runtime); raw_spin_lock(&cfs_b->lock); - if (expires == cfs_b->runtime_expires) - cfs_b->runtime -= min(runtime, cfs_b->runtime); + cfs_b->runtime -= min(runtime, cfs_b->runtime); cfs_b->distribute_running = 0; raw_spin_unlock(&cfs_b->lock); } diff --git a/kernel/sched/sched.h 
b/kernel/sched/sched.h index eaf5d3af2e92..4e1afb33166b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -226,8 +226,6 @@ struct cfs_bandwidth { ktime_t period; u64 quota, runtime; s64 hierarchical_quota; - u64 runtime_expires; - int expires_seq; short idle, period_active; struct hrtimer period_timer, slack_timer; @@ -523,8 +521,6 @@ struct cfs_rq { #endif int runtime_enabled; - int expires_seq; - u64 runtime_expires; s64 runtime_remaining; u64 throttled_clock, throttled_clock_task; -- cgit v1.2.3 From 997b726bc092e29a9c6bf5f8925c98defc69a6cd Mon Sep 17 00:00:00 2001 From: Frank Luo Date: Tue, 20 Nov 2018 15:33:34 +0800 Subject: kernel: power: Workaround for sensor ipc message causing high power consume Sync from Qcom's document KBA-180725024109 To avoid the non-wakeup type sensor data break the AP sleep flow, notify sensor subsystem in the first place of pm_suspend . Bug: 118418963 Test: measure power consumption after running test case Change-Id: I2848230d495e30ac462aef148b3f885103d9c24e Signed-off-by: Frank Luo --- drivers/soc/qcom/smp2p_sleepstate.c | 5 ++--- kernel/power/suspend.c | 6 ++++++ 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/soc/qcom/smp2p_sleepstate.c b/drivers/soc/qcom/smp2p_sleepstate.c index 2ef25e48ce50..1f0809b61220 100644 --- a/drivers/soc/qcom/smp2p_sleepstate.c +++ b/drivers/soc/qcom/smp2p_sleepstate.c @@ -20,7 +20,8 @@ #define SET_DELAY (2 * HZ) #define PROC_AWAKE_ID 12 /* 12th bit */ -static int slst_gpio_base_id; +int slst_gpio_base_id; + /** * sleepstate_pm_notifier() - PM notifier callback function. @@ -36,13 +37,11 @@ static int sleepstate_pm_notifier(struct notifier_block *nb, { switch (event) { case PM_SUSPEND_PREPARE: - gpio_set_value(slst_gpio_base_id + PROC_AWAKE_ID, 0); msleep(25); /* To be tuned based on SMP2P latencies */ msm_ipc_router_set_ws_allowed(true); break; case PM_POST_SUSPEND: - gpio_set_value(slst_gpio_base_id + PROC_AWAKE_ID, 1); msleep(25); /* To be tuned based on SMP2P latencies */ msm_ipc_router_set_ws_allowed(false); break; diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 6e7832ee6d74..18c322fe8b73 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -34,6 +34,10 @@ #include "power.h" +#include +extern int slst_gpio_base_id; +#define PROC_AWAKE_ID 12 /* 12th bit */ + const char *pm_labels[] = { "mem", "standby", "freeze", NULL }; const char *pm_states[PM_SUSPEND_MAX]; @@ -563,7 +567,9 @@ int pm_suspend(suspend_state_t state) return -EINVAL; pm_suspend_marker("entry"); + gpio_set_value(slst_gpio_base_id + PROC_AWAKE_ID, 0); error = enter_state(state); + gpio_set_value(slst_gpio_base_id + PROC_AWAKE_ID, 1); if (error) { suspend_stats.fail++; dpm_save_failed_errno(error); -- cgit v1.2.3 From 217ab2d0ef91df5539f055c07a7890153c5ce9a2 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 24 May 2018 18:49:46 -0400 Subject: rcu: Speed up calling of RCU tasks callbacks Joel Fernandes found that the synchronize_rcu_tasks() was taking a significant amount of time. He demonstrated it with the following test: # cd /sys/kernel/tracing # while [ 1 ]; do x=1; done & # echo '__schedule_bug:traceon' > set_ftrace_filter # time echo '!__schedule_bug:traceon' > set_ftrace_filter; real 0m1.064s user 0m0.000s sys 0m0.004s Where it takes a little over a second to perform the synchronize, because there's a loop that waits 1 second at a time for tasks to get through their quiescent points when there's a task that must be waited for. 
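The diff below replaces the fixed one-second wait with a graduated backoff, starting at HZ/10 and stretching toward a full HZ. The small userspace sketch here just prints that schedule so the shape of the backoff is visible; HZ=100 is an assumption for illustration.

#include <stdio.h>

#define HZ 100	/* assumed; many mobile configs use 100 or 250 */

int main(void)
{
	int fract = 10;
	unsigned long waited = 0;

	for (int iter = 1; iter <= 12; iter++) {
		int sleep_jiffies = HZ / fract;	/* schedule_timeout_interruptible(HZ/fract) */

		waited += sleep_jiffies;
		printf("iteration %2d: sleep %3d jiffies (total %lu)\n",
		       iter, sleep_jiffies, waited);
		if (fract > 1)
			fract--;		/* slowly back off toward one full HZ */
	}
	return 0;
}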
After discussion we came up with a simple way to wait for holdouts but increase the time for each iteration of the loop but no more than a full second. With the new patch we have: # time echo '!__schedule_bug:traceon' > set_ftrace_filter; real 0m0.131s user 0m0.000s sys 0m0.004s Which drops it down to 13% of what the original wait time was. Link: http://lkml.kernel.org/r/20180523063815.198302-2-joel@joelfernandes.org Reported-by: Joel Fernandes (Google) Suggested-by: Joel Fernandes (Google) Change-Id: I40bcecdfdb2a1cdae7195f1d3b107455ea4b26b1 Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Paul E. McKenney --- kernel/rcu/update.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 5f748c5a40f0..90fdf77dab7e 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -650,6 +650,7 @@ static int __noreturn rcu_tasks_kthread(void *arg) struct rcu_head *list; struct rcu_head *next; LIST_HEAD(rcu_tasks_holdouts); + int fract; /* Run on housekeeping CPUs by default. Sysadm can move if desired. */ housekeeping_affine(current); @@ -731,13 +732,25 @@ static int __noreturn rcu_tasks_kthread(void *arg) * holdouts. When the list is empty, we are done. */ lastreport = jiffies; - while (!list_empty(&rcu_tasks_holdouts)) { + + /* Start off with HZ/10 wait and slowly back off to 1 HZ wait*/ + fract = 10; + + for (;;) { bool firstreport; bool needreport; int rtst; struct task_struct *t1; - schedule_timeout_interruptible(HZ); + if (list_empty(&rcu_tasks_holdouts)) + break; + + /* Slowly back off waiting for holdouts */ + schedule_timeout_interruptible(HZ/fract); + + if (fract > 1) + fract--; + rtst = READ_ONCE(rcu_task_stall_timeout); needreport = rtst > 0 && time_after(jiffies, lastreport + rtst); -- cgit v1.2.3 From daaa5da96a74d35c64db2952add990213355ab4a Mon Sep 17 00:00:00 2001 From: Prasad Sodagudi Date: Fri, 15 Mar 2019 11:52:48 -0700 Subject: sched: Take irq_sparse lock during the isolation irq_migrate_all_off_this_cpu() is used to migrate IRQs and this function checks for all active irq in the allocated_irqs mask. irq_migrate_all_off_this_cpu() expects the caller to take irq_sparse lock to avoid race conditions while accessing allocated_irqs mask variable. Prevent a race between irq alloc/free and irq migration by adding irq_sparse lock across CPU isolation. Change-Id: I9edece1ecea45297c8f6529952d88b3133046467 Signed-off-by: Prasad Sodagudi --- kernel/sched/core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 17c13347d703..61a80b81afcf 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6024,7 +6024,9 @@ int sched_isolate_cpu(int cpu) smp_call_function_any(&avail_cpus, hrtimer_quiesce_cpu, &cpu, 1); smp_call_function_any(&avail_cpus, timer_quiesce_cpu, &cpu, 1); + irq_lock_sparse(); stop_cpus(cpumask_of(cpu), do_isolation_work_cpu_stop, 0); + irq_unlock_sparse(); calc_load_migrate(rq); update_max_interval(); -- cgit v1.2.3 From 7d11b1a7a11c598a07687f853ded9eca97d89043 Mon Sep 17 00:00:00 2001 From: Georg Veichtlbauer Date: Wed, 26 Jul 2023 21:00:09 +0200 Subject: Revert "sched: cpufreq: Use sched_clock instead of rq_clock when updating schedutil" That commit should have changed rq_clock to sched_clock, instead of sched_ktime_clock, which kept schedutil from making correct decisions. This reverts commit ef3fb04c7df43dfa1793e33f764a2581cda96310. 
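A toy pthread model of the locking rule behind the sched_isolate_cpu() hunk above: the code that walks the allocated-IRQ table must hold the same lock as the allocator, so the isolation path now brackets the migration with irq_lock_sparse()/irq_unlock_sparse(). The bitmap and printouts are stand-ins, not kernel code.

#include <pthread.h>
#include <stdio.h>
#include <stdbool.h>

#define NR_IRQS 64
static pthread_mutex_t sparse_irq_lock = PTHREAD_MUTEX_INITIALIZER;
static bool allocated_irqs[NR_IRQS];

static void irq_alloc(int irq)
{
	pthread_mutex_lock(&sparse_irq_lock);
	allocated_irqs[irq] = true;
	pthread_mutex_unlock(&sparse_irq_lock);
}

/* Caller must hold sparse_irq_lock, exactly like
 * irq_migrate_all_off_this_cpu() expects in the kernel. */
static void migrate_all_off_this_cpu(void)
{
	for (int irq = 0; irq < NR_IRQS; irq++)
		if (allocated_irqs[irq])
			printf("migrating irq %d away from the isolated CPU\n", irq);
}

static void isolate_cpu(void)
{
	pthread_mutex_lock(&sparse_irq_lock);	/* irq_lock_sparse()   */
	migrate_all_off_this_cpu();		/* stop_cpus() payload */
	pthread_mutex_unlock(&sparse_irq_lock);	/* irq_unlock_sparse() */
}

int main(void)
{
	irq_alloc(3);
	irq_alloc(17);
	isolate_cpu();
	return 0;
}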
Change-Id: Id4118894388c33bf2b2d3d5ee27eb35e82dc4a96 --- kernel/sched/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 4e1afb33166b..78ba150f2016 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2874,7 +2874,7 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) data = rcu_dereference_sched(*per_cpu_ptr(&cpufreq_update_util_data, cpu_of(rq))); if (data) - data->func(data, sched_ktime_clock(), flags); + data->func(data, rq_clock(rq), flags); } static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) -- cgit v1.2.3
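For readers unfamiliar with the hook the final hunk touches, the standalone C sketch below models cpufreq_update_util(): a per-CPU callback pointer that, when registered, is invoked with a timestamp and flags; the revert only changes which clock value is passed in. Types and names are simplified stand-ins for the kernel structures.

#include <stdio.h>

struct update_util_data {
	void (*func)(struct update_util_data *data,
		     unsigned long long time, unsigned int flags);
};

#define NR_CPUS 4
static struct update_util_data *cpufreq_update_util_data[NR_CPUS];

static void cpufreq_update_util(int cpu, unsigned long long now, unsigned int flags)
{
	struct update_util_data *data = cpufreq_update_util_data[cpu];

	if (data)				/* governor registered on this CPU? */
		data->func(data, now, flags);
}

static void sugov_update(struct update_util_data *data,
			 unsigned long long time, unsigned int flags)
{
	printf("governor callback at t=%llu ns, flags=0x%x\n", time, flags);
}

int main(void)
{
	static struct update_util_data sugov = { .func = sugov_update };

	cpufreq_update_util_data[0] = &sugov;
	cpufreq_update_util(0, 123456789ULL, 0);	/* 'now' stands in for rq_clock(rq) */
	return 0;
}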