From 0d7a6c301af8851542a9ec66a7dab571a979c057 Mon Sep 17 00:00:00 2001 From: Nagalakshmi Date: Tue, 29 Nov 2022 22:48:35 -0800 Subject: qcacld-3.0: Fix OOB in wma_scan_roam.c Currently, in the wma_extscan_hotlist_match_event_handler API, dest_hotlist gets its memory allocation based on numap, which takes its value from event->total_entries. But numap is limited to WMA_EXTSCAN_MAX_HOTLIST_ENTRIES, and an event->total_entries larger than WMA_EXTSCAN_MAX_HOTLIST_ENTRIES can cause an out-of-bounds issue. The fix is to populate dest_hotlist->numOfAps from numap instead of event->total_entries to avoid any out-of-bounds access. Change-Id: I756f7e4a4dcd454508bba83d4a8bbbb139530905 CRs-Fixed: 3346781 --- drivers/staging/qcacld-3.0/core/wma/src/wma_scan_roam.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/staging/qcacld-3.0/core/wma/src/wma_scan_roam.c b/drivers/staging/qcacld-3.0/core/wma/src/wma_scan_roam.c index f6b3c34c274e..99795c3d1785 100644 --- a/drivers/staging/qcacld-3.0/core/wma/src/wma_scan_roam.c +++ b/drivers/staging/qcacld-3.0/core/wma/src/wma_scan_roam.c @@ -5140,7 +5140,7 @@ int wma_extscan_hotlist_match_event_handler(void *handle, return -ENOMEM; } dest_ap = &dest_hotlist->ap[0]; - dest_hotlist->numOfAps = event->total_entries; + dest_hotlist->numOfAps = numap; dest_hotlist->requestId = event->config_request_id; if (event->first_entry_index + -- cgit v1.2.3 From d6038d6da57f766f4c9bb946a107e48617b414ff Mon Sep 17 00:00:00 2001 From: Soumya Managoli Date: Fri, 6 Jan 2023 14:37:20 +0530 Subject: ASoC: msm-pcm-q6-v2: Add dsp buf check The current logic copies a user-buf-sized amount of data from the available dsp buf at a given offset. If this offset, returned from the DSP in the READ_DONE event, goes out of bounds or is corrupted, it can lead to out-of-bounds DSP buffer access, resulting in a memory fault. The fix is to add a check that this buf offset is within the buf size range. Change-Id: I7753cc6db394704dbb959477150141d42b836bef Signed-off-by: Soumya Managoli --- sound/soc/msm/qdsp6v2/msm-pcm-q6-v2.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/sound/soc/msm/qdsp6v2/msm-pcm-q6-v2.c b/sound/soc/msm/qdsp6v2/msm-pcm-q6-v2.c index 487aaf2390c0..5f4225e675ad 100644 --- a/sound/soc/msm/qdsp6v2/msm-pcm-q6-v2.c +++ b/sound/soc/msm/qdsp6v2/msm-pcm-q6-v2.c @@ -1,4 +1,5 @@ /* Copyright (c) 2012-2020, The Linux Foundation. All rights reserved. + * Copyright (c) 2023 Qualcomm Innovation Center, Inc. All rights reserved.
* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 and @@ -995,6 +996,14 @@ static int msm_pcm_capture_copy(struct snd_pcm_substream *substream, xfer = size; offset = prtd->in_frame_info[idx].offset; pr_debug("Offset value = %d\n", offset); + + if (offset >= size) { + pr_err("%s: Invalid dsp buf offset\n", __func__); + ret = -EFAULT; + q6asm_cpu_buf_release(OUT, prtd->audio_client); + goto fail; + } + if (copy_to_user(buf, bufptr+offset, xfer)) { pr_err("Failed to copy buf to user\n"); ret = -EFAULT; -- cgit v1.2.3 From 094b738f46c80c56d03d923c3e780e071abe34b0 Mon Sep 17 00:00:00 2001 From: Michael Bestas Date: Tue, 23 May 2023 18:44:59 +0300 Subject: power: qpnp-smb2: Implement battery charging_enabled node Change-Id: Id08c169f0c507390eab070d1ae77bfb992b50b81 --- drivers/power/supply/qcom/qpnp-smb2.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/power/supply/qcom/qpnp-smb2.c b/drivers/power/supply/qcom/qpnp-smb2.c index 5fae7b99d88f..b0704af49353 100644 --- a/drivers/power/supply/qcom/qpnp-smb2.c +++ b/drivers/power/supply/qcom/qpnp-smb2.c @@ -917,6 +917,7 @@ static int smb2_init_dc_psy(struct smb2 *chip) *************************/ static enum power_supply_property smb2_batt_props[] = { + POWER_SUPPLY_PROP_CHARGING_ENABLED, POWER_SUPPLY_PROP_INPUT_SUSPEND, POWER_SUPPLY_PROP_STATUS, POWER_SUPPLY_PROP_HEALTH, @@ -967,6 +968,9 @@ static int smb2_batt_get_prop(struct power_supply *psy, case POWER_SUPPLY_PROP_PRESENT: rc = smblib_get_prop_batt_present(chg, val); break; + case POWER_SUPPLY_PROP_CHARGING_ENABLED: + val->intval = !get_effective_result(chg->chg_disable_votable); + break; case POWER_SUPPLY_PROP_INPUT_SUSPEND: rc = smblib_get_prop_input_suspend(chg, val); break; @@ -1079,6 +1083,9 @@ static int smb2_batt_set_prop(struct power_supply *psy, struct smb_charger *chg = power_supply_get_drvdata(psy); switch (prop) { + case POWER_SUPPLY_PROP_CHARGING_ENABLED: + vote(chg->chg_disable_votable, USER_VOTER, !!!val->intval, 0); + break; case POWER_SUPPLY_PROP_INPUT_SUSPEND: rc = smblib_set_prop_input_suspend(chg, val); break; @@ -1163,6 +1170,7 @@ static int smb2_batt_prop_is_writeable(struct power_supply *psy, enum power_supply_property psp) { switch (psp) { + case POWER_SUPPLY_PROP_CHARGING_ENABLED: case POWER_SUPPLY_PROP_INPUT_SUSPEND: case POWER_SUPPLY_PROP_SYSTEM_TEMP_LEVEL: case POWER_SUPPLY_PROP_CAPACITY: -- cgit v1.2.3 From 07f7c9961c7cd0090dd1771f61245746af7fe1ea Mon Sep 17 00:00:00 2001 From: Umang Agrawal Date: Wed, 20 Jun 2018 14:48:05 +0530 Subject: power: smb-lib: Fix mutex acquisition deadlock on PD hard reset Mutex acquisition deadlock can happen while cancelling cc_dettach work during pd_hard_reset from the function usbin_plugin_hard_reset _locked on vbus rise which is called in the same lock context that we try to acquire in the cc_dettach work routine. Check if cc_dettach work is running during pd_hard_reset and use trylock instead of mutex_lock to prevent any deadlock if mutext is already held. 
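A rough userspace analogue of the conditional-trylock pattern described above; the flag, the function name and the pthread API are stand-ins for illustration, not the driver's actual code:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static bool hard_reset_in_progress;	/* set by the path that may already hold 'lock' */

/* Work routine: take the lock normally, but fall back to trylock when the
 * canceller might already hold it, so the two paths cannot deadlock. */
static void detach_work(void)
{
	bool locked = false;

	if (hard_reset_in_progress) {
		if (pthread_mutex_trylock(&lock) == 0)
			locked = true;
	} else {
		pthread_mutex_lock(&lock);
		locked = true;
	}

	printf("handled typec change, locked=%d\n", locked);

	if (locked)
		pthread_mutex_unlock(&lock);
}

int main(void)
{
	detach_work();
	return 0;
}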
Change-Id: I5530deb9e654d3d12ba1b4bc6876f36127a0d5a5 Signed-off-by: Umang Agrawal --- drivers/power/supply/qcom/smb-lib.c | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/drivers/power/supply/qcom/smb-lib.c b/drivers/power/supply/qcom/smb-lib.c index 81623c65ea8e..1782f23fafa7 100644 --- a/drivers/power/supply/qcom/smb-lib.c +++ b/drivers/power/supply/qcom/smb-lib.c @@ -4457,6 +4457,7 @@ static void rdstd_cc2_detach_work(struct work_struct *work) { int rc; u8 stat4, stat5; + bool lock = false; struct smb_charger *chg = container_of(work, struct smb_charger, rdstd_cc2_detach_work); @@ -4519,9 +4520,28 @@ static void rdstd_cc2_detach_work(struct work_struct *work) rc = smblib_masked_write(chg, TYPE_C_INTRPT_ENB_SOFTWARE_CTRL_REG, EXIT_SNK_BASED_ON_CC_BIT, 0); smblib_reg_block_restore(chg, cc2_detach_settings); - mutex_lock(&chg->lock); + + /* + * Mutex acquisition deadlock can happen while cancelling this work + * during pd_hard_reset from the function smblib_cc2_sink_removal_exit + * which is called in the same lock context that we try to acquire in + * this work routine. + * Check if this work is running during pd_hard_reset and use trylock + * instead of mutex_lock to prevent any deadlock if mutext is already + * held. + */ + if (chg->pd_hard_reset) { + if (mutex_trylock(&chg->lock)) + lock = true; + } else { + mutex_lock(&chg->lock); + lock = true; + } + smblib_usb_typec_change(chg); - mutex_unlock(&chg->lock); + + if (lock) + mutex_unlock(&chg->lock); return; rerun: -- cgit v1.2.3 From 1daa7ea39076e334a07ffb90f55ae33398b3477f Mon Sep 17 00:00:00 2001 From: Archana Sathyakumar Date: Wed, 26 Jul 2017 07:37:51 -0600 Subject: pinctrl: qcom: Update irq handle for GPIO pins Default handle_irq for tlmm irq chip is handle_edge_irq. For direct connect GPIOs, the handle_irq is not changed unlike non-direct connect GPIOs. This causes an interrupt storm for level trigger types as handle_edge_irq does not mask the interrupt within the function. Change this to handle_fasteoi_irq such that both level and edge interrupts are handled correctly. Change-Id: I79f0d4d92145f85a8043875301400ecf36b46c7b Signed-off-by: Archana Sathyakumar --- drivers/pinctrl/qcom/pinctrl-msm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pinctrl/qcom/pinctrl-msm.c b/drivers/pinctrl/qcom/pinctrl-msm.c index 22496ad167a0..d4a1f5378ac5 100644 --- a/drivers/pinctrl/qcom/pinctrl-msm.c +++ b/drivers/pinctrl/qcom/pinctrl-msm.c @@ -905,7 +905,7 @@ static int msm_gpio_init(struct msm_pinctrl *pctrl) ret = gpiochip_irqchip_add(chip, &msm_gpio_irq_chip, 0, - handle_edge_irq, + handle_fasteoi_irq, IRQ_TYPE_NONE); if (ret) { dev_err(pctrl->dev, "Failed to add irqchip to gpiochip\n"); -- cgit v1.2.3 From 400383059868487869772b1c68ab8db4b6c81cbb Mon Sep 17 00:00:00 2001 From: Prasad Sodagudi Date: Mon, 24 Sep 2018 16:25:55 -0700 Subject: kernel: time: Add delay after cpu_relax() in tight loops Tight loops of spin_lock_irqsave() and spin_unlock_irqrestore() in timer and hrtimer are causing scheduling delays. Add delay of few nano seconds after cpu_relax in the timer/hrtimer tight loops. 
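The retry loops in alarm_cancel()/hrtimer_cancel()/del_timer_sync() keep spinning until the timer callback is no longer running; the hunks below only add a short delay between attempts. A minimal userspace sketch of that retry-with-backoff shape (the state machine is made up, the 350 ns figure mirrors the patch, and nanosleep() stands in for cpu_relax()+ndelay()):

#include <stdatomic.h>
#include <stdio.h>
#include <time.h>

static atomic_int timer_state;	/* 0 = idle, 1 = armed, -1 = callback running */

static int cancel_with_backoff(void)
{
	const struct timespec backoff = { .tv_sec = 0, .tv_nsec = 350 };

	for (;;) {
		int expected = 1;

		/* try to move armed -> idle, i.e. cancel the timer */
		if (atomic_compare_exchange_strong(&timer_state, &expected, 0))
			return 1;	/* cancelled an armed timer */
		if (expected == 0)
			return 0;	/* nothing to cancel */
		/* callback busy: brief delay instead of hammering the state */
		nanosleep(&backoff, NULL);
	}
}

int main(void)
{
	atomic_store(&timer_state, 1);
	printf("cancel returned %d\n", cancel_with_backoff());
	return 0;
}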
Change-Id: Iaa0ab92da93f7b245b1d922b6edca2bebdc0fbce Signed-off-by: Prasad Sodagudi --- include/linux/time.h | 1 + kernel/time/alarmtimer.c | 2 ++ kernel/time/hrtimer.c | 3 +++ kernel/time/timer.c | 2 ++ 4 files changed, 8 insertions(+) diff --git a/include/linux/time.h b/include/linux/time.h index 62cc50700004..cbb55e004342 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -9,6 +9,7 @@ extern struct timezone sys_tz; #define TIME_T_MAX (time_t)((1UL << ((sizeof(time_t) << 3) - 1)) - 1) +#define TIMER_LOCK_TIGHT_LOOP_DELAY_NS 350 static inline int timespec_equal(const struct timespec *a, const struct timespec *b) diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 4171fee2d4ec..612c97156df7 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -25,6 +25,7 @@ #include #include #include +#include #ifdef CONFIG_MSM_PM #include "lpm-levels.h" @@ -503,6 +504,7 @@ int alarm_cancel(struct alarm *alarm) if (ret >= 0) return ret; cpu_relax(); + ndelay(TIMER_LOCK_TIGHT_LOOP_DELAY_NS); } } EXPORT_SYMBOL_GPL(alarm_cancel); diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 79fadcad21ff..6bd4247198e2 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -49,6 +49,7 @@ #include #include #include +#include #include @@ -156,6 +157,7 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, raw_spin_unlock_irqrestore(&base->cpu_base->lock, *flags); } cpu_relax(); + ndelay(TIMER_LOCK_TIGHT_LOOP_DELAY_NS); } } @@ -1061,6 +1063,7 @@ int hrtimer_cancel(struct hrtimer *timer) if (ret >= 0) return ret; cpu_relax(); + ndelay(TIMER_LOCK_TIGHT_LOOP_DELAY_NS); } } EXPORT_SYMBOL_GPL(hrtimer_cancel); diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 67646a316436..2d8b82d90c9f 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -798,6 +798,7 @@ static struct tvec_base *lock_timer_base(struct timer_list *timer, spin_unlock_irqrestore(&base->lock, *flags); } cpu_relax(); + ndelay(TIMER_LOCK_TIGHT_LOOP_DELAY_NS); } } @@ -1148,6 +1149,7 @@ int del_timer_sync(struct timer_list *timer) if (ret >= 0) return ret; cpu_relax(); + ndelay(TIMER_LOCK_TIGHT_LOOP_DELAY_NS); } } EXPORT_SYMBOL(del_timer_sync); -- cgit v1.2.3 From 58c453484f7ef1b81d9da4a5696e849a92e61883 Mon Sep 17 00:00:00 2001 From: Giovanni Gherdovich Date: Fri, 5 Aug 2016 10:21:56 +0200 Subject: sched/cputime: Mitigate performance regression in times()/clock_gettime() Commit: 6e998916dfe3 ("sched/cputime: Fix clock_nanosleep()/clock_gettime() inconsistency") fixed a problem whereby clock_nanosleep() followed by clock_gettime() could allow a task to wake early. It addressed the problem by calling the scheduling classes update_curr() when the cputimer starts. Said change induced a considerable performance regression on the syscalls times() and clock_gettimes(CLOCK_PROCESS_CPUTIME_ID). There are some debuggers and applications that monitor their own performance that accidentally depend on the performance of these specific calls. This patch mitigates the performace loss by prefetching data in the CPU cache, as stalls due to cache misses appear to be where most time is spent in our benchmarks. Here are the performance gain of this patch over v4.7-rc7 on a Sandy Bridge box with 32 logical cores and 2 NUMA nodes. 
The test is repeated with a variable number of threads, from 2 to 4*num_cpus; the results are in seconds and correspond to the average of 10 runs; the percentage gain is computed with (before-after)/before so a positive value is an improvement (it's faster). The improvement varies between a few percent for 5-20 threads and more than 10% for 2 or >20 threads. pound_clock_gettime: threads 4.7-rc7 patched 4.7-rc7 [num] [secs] [secs (percent)] 2 3.48 3.06 ( 11.83%) 5 3.33 3.25 ( 2.40%) 8 3.37 3.26 ( 3.30%) 12 3.32 3.37 ( -1.60%) 21 4.01 3.90 ( 2.74%) 30 3.63 3.36 ( 7.41%) 48 3.71 3.11 ( 16.27%) 79 3.75 3.16 ( 15.74%) 110 3.81 3.25 ( 14.80%) 128 3.88 3.31 ( 14.76%) pound_times: threads 4.7-rc7 patched 4.7-rc7 [num] [secs] [secs (percent)] 2 3.65 3.25 ( 11.03%) 5 3.45 3.17 ( 7.92%) 8 3.52 3.22 ( 8.69%) 12 3.29 3.36 ( -2.04%) 21 4.07 3.92 ( 3.78%) 30 3.87 3.40 ( 12.17%) 48 3.79 3.16 ( 16.61%) 79 3.88 3.28 ( 15.42%) 110 3.90 3.38 ( 13.35%) 128 4.00 3.38 ( 15.45%) pound_clock_gettime and pound_times are two benchmarks included in the MMTests framework. They launch a given number of threads which repeatedly call times() or clock_gettime(). The results above can be reproduced by cloning MMTests from github.com and running the "poundtime" workload: $ git clone https://github.com/gormanm/mmtests.git $ cd mmtests $ cp configs/config-global-dhp__workload_poundtime config $ ./run-mmtests.sh --run-monitor $(uname -r) The above will run "poundtime" measuring the kernel currently running on the machine; once a new kernel is installed and the machine rebooted, running again $ cd mmtests $ ./run-mmtests.sh --run-monitor $(uname -r) will produce results to compare with. A comparison table will be output with: $ cd mmtests/work/log $ ../../compare-kernels.sh The table will contain a lot of entries; grepping for "Amean" (as in "arithmetic mean") will give the tables presented above. The source code for the two benchmarks is reported at the end of this changelog for clarity. The cache misses addressed by this patch were found using a combination of `perf top`, `perf record` and `perf annotate`. The incriminated lines were found to be struct sched_entity *curr = cfs_rq->curr; and delta_exec = now - curr->exec_start; in the function update_curr() from kernel/sched/fair.c. This patch prefetches the data from memory just before update_curr is called in the execution path of interest.
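A toy userspace illustration of the prefetch idea, using the GCC/Clang __builtin_prefetch() builtin; the struct and function here are made up and only stand in for cfs_rq->curr and the later update_curr() call:

#include <stdio.h>

struct entity {
	unsigned long long load[8];	/* stand-in for fields ahead of exec_start */
	unsigned long long exec_start;
	unsigned long long vruntime;
};

static unsigned long long account(struct entity *curr, unsigned long long now)
{
	/* warm the cache lines the computation below will need */
	__builtin_prefetch(curr);
	__builtin_prefetch(&curr->exec_start);

	/* ... other work that would otherwise expose the cache miss ... */

	return now - curr->exec_start;
}

int main(void)
{
	struct entity e = { .exec_start = 100 };

	printf("delta = %llu\n", account(&e, 250));
	return 0;
}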
A comparison of the total number of cycles before and after the patch follows; the data is obtained using `perf stat -r 10 -ddd ` running over the same sequence of number of threads used above (a positive gain is an improvement): threads cycles before cycles after gain 2 19,699,563,964 +-1.19% 17,358,917,517 +-1.85% 11.88% 5 47,401,089,566 +-2.96% 45,103,730,829 +-0.97% 4.85% 8 80,923,501,004 +-3.01% 71,419,385,977 +-0.77% 11.74% 12 112,326,485,473 +-0.47% 110,371,524,403 +-0.47% 1.74% 21 193,455,574,299 +-0.72% 180,120,667,904 +-0.36% 6.89% 30 315,073,519,013 +-1.64% 271,222,225,950 +-1.29% 13.92% 48 321,969,515,332 +-1.48% 273,353,977,321 +-1.16% 15.10% 79 337,866,003,422 +-0.97% 289,462,481,538 +-1.05% 14.33% 110 338,712,691,920 +-0.78% 290,574,233,170 +-0.77% 14.21% 128 348,384,794,006 +-0.50% 292,691,648,206 +-0.66% 15.99% A comparison of cache miss vs total cache loads ratios, before and after the patch (again from the `perf stat -r 10 -ddd ` tables): threads L1 misses/total*100 L1 misses/total*100 gain before after 2 7.43 +-4.90% 7.36 +-4.70% 0.94% 5 13.09 +-4.74% 13.52 +-3.73% -3.28% 8 13.79 +-5.61% 12.90 +-3.27% 6.45% 12 11.57 +-2.44% 8.71 +-1.40% 24.72% 21 12.39 +-3.92% 9.97 +-1.84% 19.53% 30 13.91 +-2.53% 11.73 +-2.28% 15.67% 48 13.71 +-1.59% 12.32 +-1.97% 10.14% 79 14.44 +-0.66% 13.40 +-1.06% 7.20% 110 15.86 +-0.50% 14.46 +-0.59% 8.83% 128 16.51 +-0.32% 15.06 +-0.78% 8.78% As a final note, the following shows the evolution of performance figures in the "poundtime" benchmark and pinpoints commit 6e998916dfe3 ("sched/cputime: Fix clock_nanosleep()/clock_gettime() inconsistency") as a major source of degradation, mostly unaddressed to this day (figures expressed in seconds). pound_clock_gettime: threads parent of 6e998916dfe3 4.7-rc7 6e998916dfe3 itself 2 2.23 3.68 ( -64.56%) 3.48 (-55.48%) 5 2.83 3.78 ( -33.42%) 3.33 (-17.43%) 8 2.84 4.31 ( -52.12%) 3.37 (-18.76%) 12 3.09 3.61 ( -16.74%) 3.32 ( -7.17%) 21 3.14 4.63 ( -47.36%) 4.01 (-27.71%) 30 3.28 5.75 ( -75.37%) 3.63 (-10.80%) 48 3.02 6.05 (-100.56%) 3.71 (-22.99%) 79 2.88 6.30 (-118.90%) 3.75 (-30.26%) 110 2.95 6.46 (-119.00%) 3.81 (-29.24%) 128 3.05 6.42 (-110.08%) 3.88 (-27.04%) pound_times: threads parent of 6e998916dfe3 4.7-rc7 6e998916dfe3 itself 2 2.27 3.73 ( -64.71%) 3.65 (-61.14%) 5 2.78 3.77 ( -35.56%) 3.45 (-23.98%) 8 2.79 4.41 ( -57.71%) 3.52 (-26.05%) 12 3.02 3.56 ( -17.94%) 3.29 ( -9.08%) 21 3.10 4.61 ( -48.74%) 4.07 (-31.34%) 30 3.33 5.75 ( -72.53%) 3.87 (-16.01%) 48 2.96 6.06 (-105.04%) 3.79 (-28.10%) 79 2.88 6.24 (-116.83%) 3.88 (-34.81%) 110 2.98 6.37 (-114.08%) 3.90 (-31.12%) 128 3.10 6.35 (-104.61%) 4.00 (-28.87%) The source code of the two benchmarks follows. 
To compile the two: NR_THREADS=42 for FILE in pound_times pound_clock_gettime; do gcc -lrt -O2 -lpthread -DNUM_THREADS=$NR_THREADS $FILE.c -o $FILE done ==== BEGIN pound_times.c ==== struct tms start; void *pound (void *threadid) { struct tms end; int oldutime = 0; int utime; int i; for (i = 0; i < 5000000 / NUM_THREADS; i++) { times(&end); utime = ((int)end.tms_utime - (int)start.tms_utime); if (oldutime > utime) { printf("utime decreased, was %d, now %d!\n", oldutime, utime); } oldutime = utime; } pthread_exit(NULL); } int main() { pthread_t th[NUM_THREADS]; long i; times(&start); for (i = 0; i < NUM_THREADS; i++) { pthread_create (&th[i], NULL, pound, (void *)i); } pthread_exit(NULL); return 0; } ==== END pound_times.c ==== ==== BEGIN pound_clock_gettime.c ==== void *pound (void *threadid) { struct timespec ts; int rc, i; unsigned long prev = 0, this = 0; for (i = 0; i < 5000000 / NUM_THREADS; i++) { rc = clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts); if (rc < 0) perror("clock_gettime"); this = (ts.tv_sec * 1000000000) + ts.tv_nsec; if (0 && this < prev) printf("%lu ns timewarp at iteration %d\n", prev - this, i); prev = this; } pthread_exit(NULL); } int main() { pthread_t th[NUM_THREADS]; long rc, i; pid_t pgid; for (i = 0; i < NUM_THREADS; i++) { rc = pthread_create(&th[i], NULL, pound, (void *)i); if (rc < 0) perror("pthread_create"); } pthread_exit(NULL); return 0; } ==== END pound_clock_gettime.c ==== Suggested-by: Mike Galbraith Change-Id: Iad82d9f31c92e50e1e3b1339892512526ceb0acf Signed-off-by: Giovanni Gherdovich Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mel Gorman Cc: Peter Zijlstra Cc: Stanislaw Gruszka Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1470385316-15027-2-git-send-email-ggherdovich@suse.cz Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6c3f46e759d2..18afb0fe2704 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -78,6 +78,7 @@ #include #include #include +#include #include #include @@ -3133,6 +3134,23 @@ DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); EXPORT_PER_CPU_SYMBOL(kstat); EXPORT_PER_CPU_SYMBOL(kernel_cpustat); +/* + * The function fair_sched_class.update_curr accesses the struct curr + * and its field curr->exec_start; when called from task_sched_runtime(), + * we observe a high rate of cache misses in practice. + * Prefetching this data results in improved performance. + */ +static inline void prefetch_curr_exec_start(struct task_struct *p) +{ +#ifdef CONFIG_FAIR_GROUP_SCHED + struct sched_entity *curr = (&p->se)->cfs_rq->curr; +#else + struct sched_entity *curr = (&task_rq(p)->cfs)->curr; +#endif + prefetch(curr); + prefetch(&curr->exec_start); +} + /* * Return accounted runtime for the task. * In case the task is currently running, return the runtime plus current's @@ -3167,6 +3185,7 @@ unsigned long long task_sched_runtime(struct task_struct *p) * thread, breaking clock_gettime(). */ if (task_current(rq, p) && task_on_rq_queued(p)) { + prefetch_curr_exec_start(p); update_rq_clock(rq); p->sched_class->update_curr(rq); } -- cgit v1.2.3 From f31078b5825f71499fa95b85babf6ac8c776c37d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 20 Jun 2017 01:37:38 +0200 Subject: genirq: Introduce effective affinity mask There is currently no way to evaluate the effective affinity mask of a given interrupt. Many irq chips allow only a single target CPU or a subset of CPUs in the affinity mask. 
Updating the mask at the time of setting the affinity to the subset would be counterproductive because information for cpu hotplug about assigned interrupt affinities gets lost. On CPU hotplug it's also pointless to force migrate an interrupt, which is not targeted at the CPU effectively. But currently the information is not available. Provide a seperate mask to be updated by the irq_chip->irq_set_affinity() implementations. Implement the read only proc files so the user can see the effective mask as well w/o trying to deduce it from /proc/interrupts. Signed-off-by: Thomas Gleixner Cc: Jens Axboe Cc: Marc Zyngier Cc: Michael Ellerman Cc: Keith Busch Cc: Peter Zijlstra Cc: Christoph Hellwig Link: http://lkml.kernel.org/r/20170619235446.247834245@linutronix.de Change-Id: Ibeec0031edb532d52cb411286f785aec160d6139 --- include/linux/irq.h | 29 +++++++++++++++++ kernel/irq/Kconfig | 4 +++ kernel/irq/irqdesc.c | 14 ++++++++ kernel/irq/proc.c | 90 ++++++++++++++++++++++++++++++++++++++++++++++++---- 4 files changed, 130 insertions(+), 7 deletions(-) diff --git a/include/linux/irq.h b/include/linux/irq.h index 8da001eb82aa..0e57f41bde84 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -136,6 +136,9 @@ struct irq_domain; * @node: node index useful for balancing * @handler_data: per-IRQ data for the irq_chip methods * @affinity: IRQ affinity on SMP + * @effective_affinity: The effective IRQ affinity on SMP as some irq + * chips do not allow multi CPU destinations. + * A subset of @affinity. * @msi_desc: MSI descriptor */ struct irq_common_data { @@ -146,6 +149,9 @@ struct irq_common_data { void *handler_data; struct msi_desc *msi_desc; cpumask_var_t affinity; +#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK + cpumask_var_t effective_affinity; +#endif }; /** @@ -690,6 +696,29 @@ static inline struct cpumask *irq_data_get_affinity_mask(struct irq_data *d) return d->common->affinity; } +#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK +static inline +struct cpumask *irq_data_get_effective_affinity_mask(struct irq_data *d) +{ + return d->common->effective_affinity; +} +static inline void irq_data_update_effective_affinity(struct irq_data *d, + const struct cpumask *m) +{ + cpumask_copy(d->common->effective_affinity, m); +} +#else +static inline void irq_data_update_effective_affinity(struct irq_data *d, + const struct cpumask *m) +{ +} +static inline +struct cpumask *irq_data_get_effective_affinity_mask(struct irq_data *d) +{ + return d->common->affinity; +} +#endif + unsigned int arch_dynirq_lower_bound(unsigned int from); int __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node, diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 3b48dab80164..5d00ba9af4ec 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -21,6 +21,10 @@ config GENERIC_IRQ_SHOW config GENERIC_IRQ_SHOW_LEVEL bool +# Supports effective affinity mask +config GENERIC_IRQ_EFFECTIVE_AFF_MASK + bool + # Facility to allocate a hardware interrupt. This is legacy support # and should not be used in new code. Use irq domains instead. 
config GENERIC_IRQ_LEGACY_ALLOC_HWIRQ diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 52fbf88cd2d8..e0de4682f57a 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -43,8 +43,19 @@ static int alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) gfp, node)) return -ENOMEM; +#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK + if (!zalloc_cpumask_var_node(&desc->irq_common_data.effective_affinity, + GFP_KERNEL, node)) { + free_cpumask_var(desc->irq_common_data.affinity); + return -ENOMEM; + } +#endif + #ifdef CONFIG_GENERIC_PENDING_IRQ if (!zalloc_cpumask_var_node(&desc->pending_mask, gfp, node)) { +#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK + free_cpumask_var(desc->irq_common_data.effective_affinity); +#endif free_cpumask_var(desc->irq_common_data.affinity); return -ENOMEM; } @@ -127,6 +138,9 @@ static void free_masks(struct irq_desc *desc) free_cpumask_var(desc->pending_mask); #endif free_cpumask_var(desc->irq_common_data.affinity); +#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK + free_cpumask_var(desc->irq_common_data.effective_affinity); +#endif } #else static inline void free_masks(struct irq_desc *desc) { } diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index b05509af0352..9da4f2b075d1 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -37,19 +37,47 @@ static struct proc_dir_entry *root_irq_dir; #ifdef CONFIG_SMP +enum { + AFFINITY, + AFFINITY_LIST, + EFFECTIVE, + EFFECTIVE_LIST, +}; + static int show_irq_affinity(int type, struct seq_file *m, void *v) { struct irq_desc *desc = irq_to_desc((long)m->private); - const struct cpumask *mask = desc->irq_common_data.affinity; + const struct cpumask *mask; + switch (type) { + case AFFINITY: + case AFFINITY_LIST: + mask = desc->irq_common_data.affinity; #ifdef CONFIG_GENERIC_PENDING_IRQ - if (irqd_is_setaffinity_pending(&desc->irq_data)) - mask = desc->pending_mask; + if (irqd_is_setaffinity_pending(&desc->irq_data)) + mask = desc->pending_mask; #endif - if (type) + break; + case EFFECTIVE: + case EFFECTIVE_LIST: +#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK + mask = desc->irq_common_data.effective_affinity; + break; +#else + return -EINVAL; +#endif + }; + + switch (type) { + case AFFINITY_LIST: + case EFFECTIVE_LIST: seq_printf(m, "%*pbl\n", cpumask_pr_args(mask)); - else + break; + case AFFINITY: + case EFFECTIVE: seq_printf(m, "%*pb\n", cpumask_pr_args(mask)); + break; + } return 0; } @@ -80,12 +108,12 @@ static int irq_affinity_hint_proc_show(struct seq_file *m, void *v) int no_irq_affinity; static int irq_affinity_proc_show(struct seq_file *m, void *v) { - return show_irq_affinity(0, m, v); + return show_irq_affinity(AFFINITY, m, v); } static int irq_affinity_list_proc_show(struct seq_file *m, void *v) { - return show_irq_affinity(1, m, v); + return show_irq_affinity(AFFINITY_LIST, m, v); } @@ -188,6 +216,44 @@ static const struct file_operations irq_affinity_list_proc_fops = { .write = irq_affinity_list_proc_write, }; +#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK +static int irq_effective_aff_proc_show(struct seq_file *m, void *v) +{ + return show_irq_affinity(EFFECTIVE, m); +} + +static int irq_effective_aff_list_proc_show(struct seq_file *m, void *v) +{ + return show_irq_affinity(EFFECTIVE_LIST, m); +} + +static int irq_effective_aff_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, irq_effective_aff_proc_show, PDE_DATA(inode)); +} + +static int irq_effective_aff_list_proc_open(struct inode *inode, + struct file *file) +{ + return single_open(file, 
irq_effective_aff_list_proc_show, + PDE_DATA(inode)); +} + +static const struct file_operations irq_effective_aff_proc_fops = { + .open = irq_effective_aff_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static const struct file_operations irq_effective_aff_list_proc_fops = { + .open = irq_effective_aff_list_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif + static int default_affinity_show(struct seq_file *m, void *v) { seq_printf(m, "%*pb\n", cpumask_pr_args(irq_default_affinity)); @@ -368,6 +434,12 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc) proc_create_data("node", 0444, desc->dir, &irq_node_proc_fops, (void *)(long)irq); +# ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK + proc_create_data("effective_affinity", 0444, desc->dir, + &irq_effective_aff_proc_fops, (void *)(long)irq); + proc_create_data("effective_affinity_list", 0444, desc->dir, + &irq_effective_aff_list_proc_fops, (void *)(long)irq); +# endif #endif proc_create_data("spurious", 0444, desc->dir, @@ -388,6 +460,10 @@ void unregister_irq_proc(unsigned int irq, struct irq_desc *desc) remove_proc_entry("affinity_hint", desc->dir); remove_proc_entry("smp_affinity_list", desc->dir); remove_proc_entry("node", desc->dir); +# ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK + remove_proc_entry("effective_affinity", desc->dir); + remove_proc_entry("effective_affinity_list", desc->dir); +# endif #endif remove_proc_entry("spurious", desc->dir); -- cgit v1.2.3 From 9209b5556f6acc6b2c0c29135db247f90a3eb78e Mon Sep 17 00:00:00 2001 From: Pavankumar Kondeti Date: Tue, 4 Dec 2018 12:04:59 +0530 Subject: power: qos: Use effective affinity mask PM_QOS_REQ_AFFINE_IRQ request is supposed to apply the QoS vote for the CPU(s) on which the attached interrupt arrives. Currently the QoS vote is applied to all the CPUs present in the IRQ affinity mask i.e desc->irq_data.common->affinity. However some chips configure only a single CPU from this affinity mask to receive the IRQ. This information is present in effective affinity mask of an IRQ. Start using it so that a QoS vote is not applied to other CPUs on which the IRQ never comes but present in the affinity mask. 
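In effect the vote now targets the CPUs in the effective mask, unless that mask is still empty because the IRQ has not been started, in which case the configured affinity is used. A toy model of that selection with plain integer bitmasks (bit n = CPU n; illustrative only, not kernel code):

#include <stdint.h>
#include <stdio.h>

static uint64_t qos_target_cpus(uint64_t affinity, uint64_t effective)
{
	/* prefer the effective affinity, fall back to the full mask if empty */
	return effective ? effective : affinity;
}

int main(void)
{
	/* affinity allows CPU0-3, but the chip delivers only to CPU2 */
	printf("vote mask %#llx\n",
	       (unsigned long long)qos_target_cpus(0xf, 0x4));
	/* IRQ not started yet: effective mask empty, use configured affinity */
	printf("vote mask %#llx\n",
	       (unsigned long long)qos_target_cpus(0xf, 0x0));
	return 0;
}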
Change-Id: If26aa23bebe4a7d07ffedb5ff833ccdb4f4fb6ea Signed-off-by: Pavankumar Kondeti --- kernel/power/qos.c | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/kernel/power/qos.c b/kernel/power/qos.c index e6eceb0aa496..49dc710d4a3a 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -545,19 +545,29 @@ static void pm_qos_irq_release(struct kref *ref) } static void pm_qos_irq_notify(struct irq_affinity_notify *notify, - const cpumask_t *mask) + const cpumask_t *unused_mask) { unsigned long flags; struct pm_qos_request *req = container_of(notify, struct pm_qos_request, irq_notify); struct pm_qos_constraints *c = pm_qos_array[req->pm_qos_class]->constraints; + struct irq_desc *desc = irq_to_desc(req->irq); + struct cpumask *new_affinity = + irq_data_get_effective_affinity_mask(&desc->irq_data); + bool affinity_changed = false; spin_lock_irqsave(&pm_qos_lock, flags); - cpumask_copy(&req->cpus_affine, mask); + if (!cpumask_equal(&req->cpus_affine, new_affinity)) { + cpumask_copy(&req->cpus_affine, new_affinity); + affinity_changed = true; + } + spin_unlock_irqrestore(&pm_qos_lock, flags); - pm_qos_update_target(c, req, PM_QOS_UPDATE_REQ, req->node.prio); + if (affinity_changed) + pm_qos_update_target(c, req, PM_QOS_UPDATE_REQ, + req->node.prio); } #endif @@ -601,9 +611,17 @@ void pm_qos_add_request(struct pm_qos_request *req, if (!desc) return; - mask = desc->irq_data.common->affinity; - /* Get the current affinity */ + /* + * If the IRQ is not started, the effective affinity + * won't be set. So fallback to the default affinity. + */ + mask = irq_data_get_effective_affinity_mask( + &desc->irq_data); + if (cpumask_empty(mask)) + mask = irq_data_get_affinity_mask( + &desc->irq_data); + cpumask_copy(&req->cpus_affine, mask); req->irq_notify.irq = req->irq; req->irq_notify.notify = pm_qos_irq_notify; -- cgit v1.2.3 From 602bf4096dabb119eb3e176353c4607030fbb1c7 Mon Sep 17 00:00:00 2001 From: Pavankumar Kondeti Date: Mon, 3 Dec 2018 15:05:26 +0530 Subject: genirq: Honour IRQ's affinity hint during migration An IRQ affinity is broken during hotplug/isolation when there are no online and un-isolated CPUs in the current affinity mask. An online and un-isolated CPU from the irq_default_affinity mask (i.e /proc/irq/default_smp_affinity) is used as the current affinity. However Individual IRQs can have their affinity hint set via irq_set_affinity_hint() API. When such hint is available, use it instead of the irq_default_affinity which is a system level setting. Change-Id: I53a537582ec4e1aed0c59b49f4fd5b6ca7c0c332 Signed-off-by: Pavankumar Kondeti --- kernel/irq/cpuhotplug.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c index 4684b7595e63..9fad2dc50452 100644 --- a/kernel/irq/cpuhotplug.c +++ b/kernel/irq/cpuhotplug.c @@ -36,6 +36,10 @@ static bool migrate_one_irq(struct irq_desc *desc) affinity = &available_cpus; if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { + const struct cpumask *default_affinity; + + default_affinity = desc->affinity_hint ? 
: irq_default_affinity; + /* * The order of preference for selecting a fallback CPU is * @@ -45,9 +49,9 @@ static bool migrate_one_irq(struct irq_desc *desc) */ cpumask_andnot(&available_cpus, cpu_online_mask, cpu_isolated_mask); - if (cpumask_intersects(&available_cpus, irq_default_affinity)) + if (cpumask_intersects(&available_cpus, default_affinity)) cpumask_and(&available_cpus, &available_cpus, - irq_default_affinity); + default_affinity); else if (cpumask_empty(&available_cpus)) affinity = cpu_online_mask; -- cgit v1.2.3 From 13e66175965635ed74c948f232a32033230cc5d0 Mon Sep 17 00:00:00 2001 From: Pavankumar Kondeti Date: Thu, 28 Feb 2019 10:40:39 +0530 Subject: cpuset: Restore tasks affinity while moving across cpusets When tasks move across cpusets, the current affinity settings are lost. Cache the task affinity and restore it during cpuset migration. The restoring happens only when the cached affinity is subset of the current cpuset settings. Change-Id: I6c2ec1d5e3d994e176926d94b9e0cc92418020cc Signed-off-by: Pavankumar Kondeti --- include/linux/init_task.h | 1 + include/linux/sched.h | 1 + kernel/cpuset.c | 18 ++++++++++++++++-- kernel/sched/core.c | 4 ++++ 4 files changed, 22 insertions(+), 2 deletions(-) diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 021b1e9ff6cd..8aed56931361 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -208,6 +208,7 @@ extern struct task_group root_task_group; .policy = SCHED_NORMAL, \ .cpus_allowed = CPU_MASK_ALL, \ .nr_cpus_allowed= NR_CPUS, \ + .cpus_requested = CPU_MASK_ALL, \ .mm = NULL, \ .active_mm = &init_mm, \ .restart_block = { \ diff --git a/include/linux/sched.h b/include/linux/sched.h index 70c1f7f9e4fa..9cb6964d178e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1700,6 +1700,7 @@ struct task_struct { unsigned int policy; int nr_cpus_allowed; cpumask_t cpus_allowed; + cpumask_t cpus_requested; #ifdef CONFIG_PREEMPT_RCU int rcu_read_lock_nesting; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index f3e6608313a2..a0bf3a7ce550 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -852,6 +852,20 @@ void rebuild_sched_domains(void) put_online_cpus(); } +static int update_cpus_allowed(struct cpuset *cs, struct task_struct *p, + const struct cpumask *new_mask) +{ + int ret; + + if (cpumask_subset(&p->cpus_requested, cs->cpus_requested)) { + ret = set_cpus_allowed_ptr(p, &p->cpus_requested); + if (!ret) + return ret; + } + + return set_cpus_allowed_ptr(p, new_mask); +} + /** * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed @@ -867,7 +881,7 @@ static void update_tasks_cpumask(struct cpuset *cs) css_task_iter_start(&cs->css, &it); while ((task = css_task_iter_next(&it))) - set_cpus_allowed_ptr(task, cs->effective_cpus); + update_cpus_allowed(cs, task, cs->effective_cpus); css_task_iter_end(&it); } @@ -1556,7 +1570,7 @@ static void cpuset_attach(struct cgroup_taskset *tset) * can_attach beforehand should guarantee that this doesn't * fail. 
TODO: have a better way to handle failure here */ - WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach)); + WARN_ON_ONCE(update_cpus_allowed(cs, task, cpus_attach)); cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to); cpuset_update_task_spread_flag(cs, task); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 18afb0fe2704..b33433586774 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4921,6 +4921,9 @@ again: retval = -EINVAL; } + if (!retval && !(p->flags & PF_KTHREAD)) + cpumask_and(&p->cpus_requested, in_mask, cpu_possible_mask); + out_free_new_mask: free_cpumask_var(new_mask); out_free_cpus_allowed: @@ -8344,6 +8347,7 @@ void __init sched_init_smp(void) /* Move init over to a non-isolated CPU */ if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0) BUG(); + cpumask_copy(¤t->cpus_requested, cpu_possible_mask); sched_init_granularity(); free_cpumask_var(non_isolated_cpus); -- cgit v1.2.3 From 12af218146a6c1f3b2b3cec48b14076131f8ecdb Mon Sep 17 00:00:00 2001 From: Adrian Salido Date: Tue, 16 May 2017 20:08:29 -0700 Subject: msm: mdss: add idle state node Add a helper node that can be used to notify user space through sysfs node when fb device has not had any activity for a specified amount of time (through idle_time node). Bug: 62110101 Change-Id: I4dfa4b1a376149aa55a940dad7ac336ec99f1af8 Signed-off-by: Adrian Salido --- drivers/video/fbdev/msm/mdss_fb.c | 45 +++++++++++++++++++++++++++++++++------ 1 file changed, 38 insertions(+), 7 deletions(-) diff --git a/drivers/video/fbdev/msm/mdss_fb.c b/drivers/video/fbdev/msm/mdss_fb.c index 530da7af1866..64f86084b01d 100644 --- a/drivers/video/fbdev/msm/mdss_fb.c +++ b/drivers/video/fbdev/msm/mdss_fb.c @@ -520,12 +520,17 @@ static void __mdss_fb_idle_notify_work(struct work_struct *work) /* Notify idle-ness here */ pr_debug("Idle timeout %dms expired!\n", mfd->idle_time); - if (mfd->idle_time) - sysfs_notify(&mfd->fbi->dev->kobj, NULL, "idle_notify"); + mfd->idle_state = MDSS_FB_IDLE; + /* + * idle_notify node events are used to reduce MDP load when idle, + * this is not needed for command mode panels. 
+ */ + if (mfd->idle_time && mfd->panel.type != MIPI_CMD_PANEL) + sysfs_notify(&mfd->fbi->dev->kobj, NULL, "idle_notify"); + sysfs_notify(&mfd->fbi->dev->kobj, NULL, "idle_state"); } - static ssize_t mdss_fb_get_fps_info(struct device *dev, struct device_attribute *attr, char *buf) { @@ -586,6 +591,26 @@ static ssize_t mdss_fb_get_idle_notify(struct device *dev, return ret; } +static ssize_t mdss_fb_get_idle_state(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct fb_info *fbi = dev_get_drvdata(dev); + struct msm_fb_data_type *mfd = fbi->par; + const char *state_strs[] = { + [MDSS_FB_NOT_IDLE] = "active", + [MDSS_FB_IDLE_TIMER_RUNNING] = "pending", + [MDSS_FB_IDLE] = "idle", + }; + int state = mfd->idle_state; + const char *s; + if (state < ARRAY_SIZE(state_strs) && state_strs[state]) + s = state_strs[state]; + else + s = "invalid"; + + return scnprintf(buf, PAGE_SIZE, "%s\n", s); +} + static ssize_t mdss_fb_get_panel_info(struct device *dev, struct device_attribute *attr, char *buf) { @@ -922,6 +947,7 @@ static DEVICE_ATTR(show_blank_event, S_IRUGO, mdss_mdp_show_blank_event, NULL); static DEVICE_ATTR(idle_time, S_IRUGO | S_IWUSR | S_IWGRP, mdss_fb_get_idle_time, mdss_fb_set_idle_time); static DEVICE_ATTR(idle_notify, S_IRUGO, mdss_fb_get_idle_notify, NULL); +static DEVICE_ATTR(idle_state, S_IRUGO, mdss_fb_get_idle_state, NULL); static DEVICE_ATTR(msm_fb_panel_info, S_IRUGO, mdss_fb_get_panel_info, NULL); static DEVICE_ATTR(msm_fb_src_split_info, S_IRUGO, mdss_fb_get_src_split_info, NULL); @@ -943,6 +969,7 @@ static struct attribute *mdss_fb_attrs[] = { &dev_attr_show_blank_event.attr, &dev_attr_idle_time.attr, &dev_attr_idle_notify.attr, + &dev_attr_idle_state.attr, &dev_attr_msm_fb_panel_info.attr, &dev_attr_msm_fb_src_split_info.attr, &dev_attr_msm_fb_thermal_level.attr, @@ -3136,14 +3163,18 @@ static int __mdss_fb_sync_buf_done_callback(struct notifier_block *p, ret = __mdss_fb_wait_for_fence_sub(sync_pt_data, sync_pt_data->temp_fen, fence_cnt); } - if (mfd->idle_time && !mod_delayed_work(system_wq, + if (mfd->idle_time) { + if (!mod_delayed_work(system_wq, &mfd->idle_notify_work, msecs_to_jiffies(mfd->idle_time))) - pr_debug("fb%d: restarted idle work\n", - mfd->index); + pr_debug("fb%d: restarted idle work\n", + mfd->index); + mfd->idle_state = MDSS_FB_IDLE_TIMER_RUNNING; + } else { + mfd->idle_state = MDSS_FB_IDLE; + } if (ret == -ETIME) ret = NOTIFY_BAD; - mfd->idle_state = MDSS_FB_IDLE_TIMER_RUNNING; break; case MDP_NOTIFY_FRAME_FLUSHED: pr_debug("%s: frame flushed\n", sync_pt_data->fence_name); -- cgit v1.2.3 From 620df03a7229bd2e13cdd32c3b56babc5b40b797 Mon Sep 17 00:00:00 2001 From: Georg Veichtlbauer Date: Mon, 26 Jun 2023 13:33:34 +0200 Subject: msm: mdss: Treat polling_en as the bool that it is Change-Id: Ifaa68915b52a0d6b54a5f80576ae65ba527a6c16 --- drivers/video/fbdev/msm/mdss_mdp_intf_video.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/video/fbdev/msm/mdss_mdp_intf_video.c b/drivers/video/fbdev/msm/mdss_mdp_intf_video.c index 3761fa4af0eb..caa910db508c 100644 --- a/drivers/video/fbdev/msm/mdss_mdp_intf_video.c +++ b/drivers/video/fbdev/msm/mdss_mdp_intf_video.c @@ -1236,7 +1236,7 @@ static int mdss_mdp_video_wait4comp(struct mdss_mdp_ctl *ctl, void *arg) if (rc == 0) { pr_warn("vsync wait timeout %d, fallback to poll mode\n", ctl->num); - ctx->polling_en++; + ctx->polling_en = true; rc = mdss_mdp_video_pollwait(ctl); } else { rc = 0; -- cgit v1.2.3 From 12d40f1995b47ccbb29081c07591d1521d872e96 Mon Sep 17 00:00:00 
2001 From: Georg Veichtlbauer Date: Mon, 26 Jun 2023 13:45:53 +0200 Subject: msm: mdss: Fix indentation Change-Id: I930b89ba4d4312bd830e396ec6f6b62a0516f725 --- drivers/video/fbdev/msm/msm_mdss_io_8974.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/video/fbdev/msm/msm_mdss_io_8974.c b/drivers/video/fbdev/msm/msm_mdss_io_8974.c index 922c4440ba82..000beebe0375 100644 --- a/drivers/video/fbdev/msm/msm_mdss_io_8974.c +++ b/drivers/video/fbdev/msm/msm_mdss_io_8974.c @@ -1321,16 +1321,16 @@ static void mdss_dsi_phy_regulator_ctrl(struct mdss_dsi_ctrl_pdata *ctrl, mdss_dsi_20nm_phy_regulator_enable(ctrl); break; default: - /* - * For dual dsi case, do not reconfigure dsi phy - * regulator if the other dsi controller is still - * active. - */ - if (!mdss_dsi_is_hw_config_dual(sdata) || - (other_ctrl && (!other_ctrl->is_phyreg_enabled - || other_ctrl->mmss_clamp))) - mdss_dsi_28nm_phy_regulator_enable(ctrl); - break; + /* + * For dual dsi case, do not reconfigure dsi phy + * regulator if the other dsi controller is still + * active. + */ + if (!mdss_dsi_is_hw_config_dual(sdata) || + (other_ctrl && (!other_ctrl->is_phyreg_enabled + || other_ctrl->mmss_clamp))) + mdss_dsi_28nm_phy_regulator_enable(ctrl); + break; } } ctrl->is_phyreg_enabled = 1; -- cgit v1.2.3 From cbe0b37059c9c491c48b0d014f2ceb0f2e9e2e96 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 7 Jun 2018 17:05:28 -0700 Subject: mm: introduce arg_lock to protect arg_start|end and env_start|end in mm_struct mmap_sem is on the hot path of kernel, and it very contended, but it is abused too. It is used to protect arg_start|end and evn_start|end when reading /proc/$PID/cmdline and /proc/$PID/environ, but it doesn't make sense since those proc files just expect to read 4 values atomically and not related to VM, they could be set to arbitrary values by C/R. And, the mmap_sem contention may cause unexpected issue like below: INFO: task ps:14018 blocked for more than 120 seconds. Tainted: G E 4.9.79-009.ali3000.alios7.x86_64 #1 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. ps D 0 14018 1 0x00000004 Call Trace: schedule+0x36/0x80 rwsem_down_read_failed+0xf0/0x150 call_rwsem_down_read_failed+0x18/0x30 down_read+0x20/0x40 proc_pid_cmdline_read+0xd9/0x4e0 __vfs_read+0x37/0x150 vfs_read+0x96/0x130 SyS_read+0x55/0xc0 entry_SYSCALL_64_fastpath+0x1a/0xc5 Both Alexey Dobriyan and Michal Hocko suggested to use dedicated lock for them to mitigate the abuse of mmap_sem. So, introduce a new spinlock in mm_struct to protect the concurrent access to arg_start|end, env_start|end and others, as well as replace write map_sem to read to protect the race condition between prctl and sys_brk which might break check_data_rlimit(), and makes prctl more friendly to other VM operations. This patch just eliminates the abuse of mmap_sem, but it can't resolve the above hung task warning completely since the later access_remote_vm() call needs acquire mmap_sem. The mmap_sem scalability issue will be solved in the future. 
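A minimal userspace sketch of the locking pattern being introduced: a wide, contended lock for the address space plus a narrow dedicated lock for the four boundary fields, so readers of just those values never wait on the big lock (names illustrative, not kernel code):

#include <pthread.h>
#include <stdio.h>

struct mm_like {
	pthread_rwlock_t mmap_sem;	/* big, contended lock */
	pthread_mutex_t arg_lock;	/* tiny lock for the fields below only */
	unsigned long arg_start, arg_end, env_start, env_end;
};

static struct mm_like mm = {
	.mmap_sem = PTHREAD_RWLOCK_INITIALIZER,
	.arg_lock = PTHREAD_MUTEX_INITIALIZER,
	.arg_start = 0x1000, .arg_end = 0x1100,
	.env_start = 0x1100, .env_end = 0x1200,
};

/* what a cmdline/environ reader needs: a consistent snapshot of four
 * values, taken without touching the big mmap lock */
static void snapshot_args(unsigned long out[4])
{
	pthread_mutex_lock(&mm.arg_lock);
	out[0] = mm.arg_start;
	out[1] = mm.arg_end;
	out[2] = mm.env_start;
	out[3] = mm.env_end;
	pthread_mutex_unlock(&mm.arg_lock);
}

int main(void)
{
	unsigned long v[4];

	snapshot_args(v);
	printf("args: %#lx-%#lx env: %#lx-%#lx\n", v[0], v[1], v[2], v[3]);
	return 0;
}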
Change-Id: Ifa8f001ee2fc4f0ce60c18e771cebcf8a1f0943e [yang.shi@linux.alibaba.com: add comment about mmap_sem and arg_lock] Link: http://lkml.kernel.org/r/1524077799-80690-1-git-send-email-yang.shi@linux.alibaba.com Link: http://lkml.kernel.org/r/1523730291-109696-1-git-send-email-yang.shi@linux.alibaba.com Signed-off-by: Yang Shi Reviewed-by: Cyrill Gorcunov Acked-by: Michal Hocko Cc: Alexey Dobriyan Cc: Matthew Wilcox Cc: Mateusz Guzik Cc: Kirill Tkhai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Git-commit: 88aa7cc688d48ddd84558b41d5905a0db9535c4b Git-repo: git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git Signed-off-by: Srinivas Ramana --- fs/proc/base.c | 8 ++++---- include/linux/mm_types.h | 2 ++ kernel/fork.c | 1 + kernel/sys.c | 10 ++++++++-- mm/init-mm.c | 1 + 5 files changed, 16 insertions(+), 6 deletions(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index e16afc80f810..c7402cb76f11 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -233,12 +233,12 @@ static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, goto out_mmput; } - down_read(&mm->mmap_sem); + spin_lock(&mm->arg_lock); arg_start = mm->arg_start; arg_end = mm->arg_end; env_start = mm->env_start; env_end = mm->env_end; - up_read(&mm->mmap_sem); + spin_unlock(&mm->arg_lock); BUG_ON(arg_start > arg_end); BUG_ON(env_start > env_end); @@ -990,10 +990,10 @@ static ssize_t environ_read(struct file *file, char __user *buf, if (!atomic_inc_not_zero(&mm->mm_users)) goto free; - down_read(&mm->mmap_sem); + spin_lock(&mm->arg_lock); env_start = mm->env_start; env_end = mm->env_end; - up_read(&mm->mmap_sem); + spin_unlock(&mm->arg_lock); while (count > 0) { size_t this_len, max_len; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 29c17fae9bbf..1019e8d3c88f 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -442,6 +442,8 @@ struct mm_struct { unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE */ unsigned long stack_vm; /* VM_GROWSUP/DOWN */ unsigned long def_flags; + + spinlock_t arg_lock; /* protect the below fields */ unsigned long start_code, end_code, start_data, end_data; unsigned long start_brk, brk, start_stack; unsigned long arg_start, arg_end, env_start, env_end; diff --git a/kernel/fork.c b/kernel/fork.c index a21adc0155b9..c25ebf6dd7f9 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -621,6 +621,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm->pinned_vm = 0; memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); spin_lock_init(&mm->page_table_lock); + spin_lock_init(&mm->arg_lock); mm_init_cpumask(mm); mm_init_aio(mm); mm_init_owner(mm, p); diff --git a/kernel/sys.c b/kernel/sys.c index d5ea3360038c..25cf2aa72d3b 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1854,7 +1854,11 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data return error; } - down_write(&mm->mmap_sem); + /* + * arg_lock protects concurent updates but we still need mmap_sem for + * read to exclude races with sys_brk. 
+ */ + down_read(&mm->mmap_sem); /* * We don't validate if these members are pointing to @@ -1868,6 +1872,7 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data * to any problem in kernel itself */ + spin_lock(&mm->arg_lock); mm->start_code = prctl_map.start_code; mm->end_code = prctl_map.end_code; mm->start_data = prctl_map.start_data; @@ -1879,6 +1884,7 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data mm->arg_end = prctl_map.arg_end; mm->env_start = prctl_map.env_start; mm->env_end = prctl_map.env_end; + spin_unlock(&mm->arg_lock); /* * Note this update of @saved_auxv is lockless thus @@ -1891,7 +1897,7 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data if (prctl_map.auxv_size) memcpy(mm->saved_auxv, user_auxv, sizeof(user_auxv)); - up_write(&mm->mmap_sem); + up_read(&mm->mmap_sem); return 0; } #endif /* CONFIG_CHECKPOINT_RESTORE */ diff --git a/mm/init-mm.c b/mm/init-mm.c index 975e49f00f34..02a6962d5b0b 100644 --- a/mm/init-mm.c +++ b/mm/init-mm.c @@ -21,6 +21,7 @@ struct mm_struct init_mm = { .mm_count = ATOMIC_INIT(1), .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem), .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), + .arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock), .mmlist = LIST_HEAD_INIT(init_mm.mmlist), .user_ns = &init_user_ns, INIT_MM_CONTEXT(init_mm) -- cgit v1.2.3 From ebf270d24640abda4ddc8061e615facdb9b074d0 Mon Sep 17 00:00:00 2001 From: John Dias Date: Mon, 6 Aug 2018 14:08:03 -0700 Subject: sched/fair: vruntime should normalize when switching from fair When rt_mutex_setprio changes a task's scheduling class to RT, we're seeing cases where the task's vruntime is not updated correctly upon return to the fair class. Specifically, the following is being observed: - task is deactivated while still in the fair class - task is boosted to RT via rt_mutex_setprio, which changes the task to RT and calls check_class_changed. - check_class_changed leads to detach_task_cfs_rq, at which point the vruntime_normalized check sees that the task's state is TASK_WAKING, which results in skipping the subtraction of the rq's min_vruntime from the task's vruntime - later, when the prio is deboosted and the task is moved back to the fair class, the fair rq's min_vruntime is added to the task's vruntime, even though it wasn't subtracted earlier. The immediate result is inflation of the task's vruntime, giving it lower priority (starving it if there's enough available work). The longer-term effect is inflation of all vruntimes because the task's vruntime becomes the rq's min_vruntime when the higher priority tasks go idle. That leads to a vicious cycle, where the vruntime inflation repeatedly doubled. The change here is to detect when vruntime_normalized is being called when the task is waking but is waking in another class, and to conclude that this is a case where vruntime has not been normalized. Bug: 80502612 Change-Id: If0bb02eb16939ca5e91ef282b7f9119ff68622c4 Signed-off-by: John Dias --- kernel/sched/fair.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 51443a801af5..3c0a8050b77d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -11772,7 +11772,8 @@ static inline bool vruntime_normalized(struct task_struct *p) * - A task which has been woken up by try_to_wake_up() and * waiting for actually being woken up by sched_ttwu_pending(). 
*/ - if (!se->sum_exec_runtime || p->state == TASK_WAKING) + if (!se->sum_exec_runtime || + (p->state == TASK_WAKING && p->sched_class == &fair_sched_class)) return true; return false; -- cgit v1.2.3 From 90dccbae4c0410e209049e22eaae9cd2718e1aa5 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 5 Mar 2019 15:43:06 -0800 Subject: UPSTREAM: mm: reuse only-pte-mapped KSM page in do_wp_page() Add an optimization for KSM pages almost in the same way that we have for ordinary anonymous pages. If there is a write fault in a page, which is mapped to an only pte, and it is not related to swap cache; the page may be reused without copying its content. [ Note that we do not consider PageSwapCache() pages at least for now, since we don't want to complicate __get_ksm_page(), which has nice optimization based on this (for the migration case). Currenly it is spinning on PageSwapCache() pages, waiting for when they have unfreezed counters (i.e., for the migration finish). But we don't want to make it also spinning on swap cache pages, which we try to reuse, since there is not a very high probability to reuse them. So, for now we do not consider PageSwapCache() pages at all. ] So in reuse_ksm_page() we check for 1) PageSwapCache() and 2) page_stable_node(), to skip a page, which KSM is currently trying to link to stable tree. Then we do page_ref_freeze() to prohibit KSM to merge one more page into the page, we are reusing. After that, nobody can refer to the reusing page: KSM skips !PageSwapCache() pages with zero refcount; and the protection against of all other participants is the same as for reused ordinary anon pages pte lock, page lock and mmap_sem. [akpm@linux-foundation.org: replace BUG_ON()s with WARN_ON()s] Link: http://lkml.kernel.org/r/154471491016.31352.1168978849911555609.stgit@localhost.localdomain Signed-off-by: Kirill Tkhai Reviewed-by: Yang Shi Cc: "Kirill A. Shutemov" Cc: Hugh Dickins Cc: Andrea Arcangeli Cc: Christian Koenig Cc: Claudio Imbrenda Cc: Rik van Riel Cc: Huang Ying Cc: Minchan Kim Cc: Kirill Tkhai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Change-Id: If32387b1f7c36f0e12fcbb0926bf1b67886ec594 --- include/linux/ksm.h | 7 +++++++ mm/ksm.c | 30 ++++++++++++++++++++++++++++-- mm/memory.c | 17 +++++++++++++++-- 3 files changed, 50 insertions(+), 4 deletions(-) diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 481c8c4627ca..febba394f93c 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -62,6 +62,8 @@ struct page *ksm_might_need_to_copy(struct page *page, int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc); void ksm_migrate_page(struct page *newpage, struct page *oldpage); +bool reuse_ksm_page(struct page *page, + struct vm_area_struct *vma, unsigned long address); #else /* !CONFIG_KSM */ @@ -102,6 +104,11 @@ static inline int rmap_walk_ksm(struct page *page, static inline void ksm_migrate_page(struct page *newpage, struct page *oldpage) { } +static inline bool reuse_ksm_page(struct page *page, + struct vm_area_struct *vma, unsigned long address) +{ + return false; +} #endif /* CONFIG_MMU */ #endif /* !CONFIG_KSM */ diff --git a/mm/ksm.c b/mm/ksm.c index 2a4ef426b331..bfee36c149e6 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -580,8 +580,9 @@ again: * case this node is no longer referenced, and should be freed; * however, it might mean that the page is under page_freeze_refs(). 
* The __remove_mapping() case is easy, again the node is now stale; - * but if page is swapcache in migrate_page_move_mapping(), it might - * still be our page, in which case it's essential to keep the node. + * the same is in reuse_ksm_page() case; but if page is swapcache + * in migrate_page_move_mapping(), it might still be our page, + * in which case it's essential to keep the node. */ while (!get_page_unless_zero(page)) { /* @@ -2061,6 +2062,31 @@ out: return ret; } +bool reuse_ksm_page(struct page *page, + struct vm_area_struct *vma, + unsigned long address) +{ +#ifdef CONFIG_DEBUG_VM + if (WARN_ON(is_zero_pfn(page_to_pfn(page))) || + WARN_ON(!page_mapped(page)) || + WARN_ON(!PageLocked(page))) { + dump_page(page, "reuse_ksm_page"); + return false; + } +#endif + + if (PageSwapCache(page) || !page_stable_node(page)) + return false; + /* Prohibit parallel get_ksm_page() */ + if (!page_ref_freeze(page, 1)) + return false; + + page_move_anon_rmap(page, vma); + page->index = linear_page_index(vma, address); + page_ref_unfreeze(page, 1); + + return true; +} #ifdef CONFIG_MIGRATION void ksm_migrate_page(struct page *newpage, struct page *oldpage) { diff --git a/mm/memory.c b/mm/memory.c index 09a57fe6ae01..ccb04d3f9bab 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2399,7 +2399,10 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, * Take out anonymous pages first, anonymous shared vmas are * not dirty accountable. */ - if (PageAnon(old_page) && !PageKsm(old_page)) { + if (PageAnon(old_page)) { + if (PageKsm(old_page) && (PageSwapCache(old_page) || + page_count(old_page) != 1)) + goto copy; if (!trylock_page(old_page)) { page_cache_get(old_page); pte_unmap_unlock(page_table, ptl); @@ -2414,6 +2417,16 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, } page_cache_release(old_page); } + if (PageKsm(old_page)) { + bool reused = reuse_ksm_page(old_page, vma, + address); + unlock_page(old_page); + if (!reused) + goto copy; + wp_page_reuse(mm, vma, address, page_table, ptl, + orig_pte, old_page, 0, 0); + return VM_FAULT_WRITE; + } if (reuse_swap_page(old_page)) { /* * The page is all ours. Move it to our anon_vma so @@ -2431,7 +2444,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, return wp_page_shared(mm, vma, address, page_table, pmd, ptl, orig_pte, old_page); } - +copy: /* * Ok, we need to copy. Oh, well.. */ -- cgit v1.2.3 From 46c6fbdd185a7005fac895c5cfa7896b9329324f Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 21 Aug 2020 19:49:55 -0400 Subject: BACKPORT: mm: do_wp_page() simplification How about we just make sure we're the only possible valid user fo the page before we bother to reuse it? Simplify, simplify, simplify. And get rid of the nasty serialization on the page lock at the same time. [peterx: add subject prefix] Signed-off-by: Linus Torvalds Signed-off-by: Peter Xu Signed-off-by: Linus Torvalds Change-Id: I25c2cf9a3afebc19aaeda29ffbfcb82edb7fcb7c --- mm/memory.c | 49 ++++++++++++++----------------------------------- 1 file changed, 14 insertions(+), 35 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index ccb04d3f9bab..8e44f5d2d5f4 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2400,45 +2400,24 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, * not dirty accountable. 
*/ if (PageAnon(old_page)) { - if (PageKsm(old_page) && (PageSwapCache(old_page) || - page_count(old_page) != 1)) + /* PageKsm() doesn't necessarily raise the page refcount */ + if (PageKsm(old_page) || page_count(old_page) != 1) goto copy; - if (!trylock_page(old_page)) { - page_cache_get(old_page); - pte_unmap_unlock(page_table, ptl); - lock_page(old_page); - page_table = pte_offset_map_lock(mm, pmd, address, - &ptl); - if (!pte_same(*page_table, orig_pte)) { - unlock_page(old_page); - pte_unmap_unlock(page_table, ptl); - page_cache_release(old_page); - return 0; - } - page_cache_release(old_page); - } - if (PageKsm(old_page)) { - bool reused = reuse_ksm_page(old_page, vma, - address); - unlock_page(old_page); - if (!reused) - goto copy; - wp_page_reuse(mm, vma, address, page_table, ptl, - orig_pte, old_page, 0, 0); - return VM_FAULT_WRITE; - } - if (reuse_swap_page(old_page)) { - /* - * The page is all ours. Move it to our anon_vma so - * the rmap code will not search our parent or siblings. - * Protected against the rmap code by the page lock. - */ - page_move_anon_rmap(old_page, vma, address); + if (!trylock_page(old_page)) + goto copy; + if (PageKsm(old_page) || page_mapcount(old_page) != 1 || page_count(old_page) != 1) { unlock_page(old_page); - return wp_page_reuse(mm, vma, address, page_table, ptl, - orig_pte, old_page, 0, 0); + goto copy; } + /* + * Ok, we've got the only map reference, and the only + * page count reference, and the page is locked, + * it's dark out, and we're wearing sunglasses. Hit it. + */ + wp_page_reuse(mm, vma, address, page_table, ptl, + orig_pte, old_page, 0, 0); unlock_page(old_page); + return VM_FAULT_WRITE; } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == (VM_WRITE|VM_SHARED))) { return wp_page_shared(mm, vma, address, page_table, pmd, -- cgit v1.2.3 From 899def5edcd48ff16c0f41624ecf8be77c81a18d Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 21 Aug 2020 19:49:56 -0400 Subject: UPSTREAM: mm/ksm: Remove reuse_ksm_page() Remove the function as the last reference has gone away with the do_wp_page() changes. 
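Note: taken together, the simplification above and this removal reduce the anonymous-page reuse decision in do_wp_page() to a single predicate: reuse in place only when we demonstrably hold the sole reference, otherwise copy. The standalone C sketch below is only a model of that decision; the struct fields and helper names are invented stand-ins for the page flags and refcounts the real code inspects, not kernel APIs.

    #include <stdbool.h>
    #include <stdio.h>

    struct page_model {
            bool is_ksm;    /* models PageKsm()        */
            bool locked;    /* models trylock_page()   */
            int refcount;   /* models page_count()     */
            int mapcount;   /* models page_mapcount()  */
    };

    /* True when the fault handler may reuse the page in place;
     * false means fall through to the copy path. */
    static bool may_reuse(const struct page_model *p)
    {
            if (p->is_ksm || p->refcount != 1)
                    return false;   /* someone else may hold a reference */
            if (!p->locked)
                    return false;   /* lost the race for the page lock */
            /* Re-check under the lock: still the only map and reference. */
            return p->mapcount == 1 && p->refcount == 1;
    }

    int main(void)
    {
            struct page_model sole = { .is_ksm = false, .locked = true,
                                       .refcount = 1, .mapcount = 1 };
            struct page_model shared = { .is_ksm = false, .locked = true,
                                         .refcount = 2, .mapcount = 2 };

            printf("sole owner: %s\n", may_reuse(&sole) ? "reuse" : "copy");
            printf("shared:     %s\n", may_reuse(&shared) ? "reuse" : "copy");
            return 0;
    }

Any doubt (an extra reference, a lost lock race, a KSM page) falls through to the copy path, which is always safe; that is the design choice the simplification makes.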
Signed-off-by: Peter Xu Signed-off-by: Linus Torvalds Change-Id: Ie4da88791abe9407157566854b2db9b94c0c962f --- include/linux/ksm.h | 7 ------- mm/ksm.c | 25 ------------------------- 2 files changed, 32 deletions(-) diff --git a/include/linux/ksm.h b/include/linux/ksm.h index febba394f93c..481c8c4627ca 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -62,8 +62,6 @@ struct page *ksm_might_need_to_copy(struct page *page, int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc); void ksm_migrate_page(struct page *newpage, struct page *oldpage); -bool reuse_ksm_page(struct page *page, - struct vm_area_struct *vma, unsigned long address); #else /* !CONFIG_KSM */ @@ -104,11 +102,6 @@ static inline int rmap_walk_ksm(struct page *page, static inline void ksm_migrate_page(struct page *newpage, struct page *oldpage) { } -static inline bool reuse_ksm_page(struct page *page, - struct vm_area_struct *vma, unsigned long address) -{ - return false; -} #endif /* CONFIG_MMU */ #endif /* !CONFIG_KSM */ diff --git a/mm/ksm.c b/mm/ksm.c index bfee36c149e6..98920d1d59ed 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -2062,31 +2062,6 @@ out: return ret; } -bool reuse_ksm_page(struct page *page, - struct vm_area_struct *vma, - unsigned long address) -{ -#ifdef CONFIG_DEBUG_VM - if (WARN_ON(is_zero_pfn(page_to_pfn(page))) || - WARN_ON(!page_mapped(page)) || - WARN_ON(!PageLocked(page))) { - dump_page(page, "reuse_ksm_page"); - return false; - } -#endif - - if (PageSwapCache(page) || !page_stable_node(page)) - return false; - /* Prohibit parallel get_ksm_page() */ - if (!page_ref_freeze(page, 1)) - return false; - - page_move_anon_rmap(page, vma); - page->index = linear_page_index(vma, address); - page_ref_unfreeze(page, 1); - - return true; -} #ifdef CONFIG_MIGRATION void ksm_migrate_page(struct page *newpage, struct page *oldpage) { -- cgit v1.2.3 From 45df1516d04a155794e56e6ded1c4813a3e56048 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 24 Sep 2020 08:41:32 -0700 Subject: UPSTREAM: mm: fix misplaced unlock_page in do_wp_page() Commit 09854ba94c6a ("mm: do_wp_page() simplification") reorganized all the code around the page re-use vs copy, but in the process also moved the final unlock_page() around to after the wp_page_reuse() call. That normally doesn't matter - but it means that the unlock_page() is now done after releasing the page table lock. Again, not a big deal, you'd think. But it turns out that it's very wrong indeed, because once we've released the page table lock, we've basically lost our only reference to the page - the page tables - and it could now be free'd at any time. We do hold the mmap_sem, so no actual unmap() can happen, but madvise can come in and a MADV_DONTNEED will zap the page range - and free the page. So now the page may be free'd just as we're unlocking it, which in turn will usually trigger a "Bad page state" error in the freeing path. To make matters more confusing, by the time the debug code prints out the page state, the unlock has typically completed and everything looks fine again. This all doesn't happen in any normal situations, but it does trigger with the dirtyc0w_child LTP test. And it seems to trigger much more easily (but not expclusively) on s390 than elsewhere, probably because s390 doesn't do the "batch pages up for freeing after the TLB flush" that gives the unlock_page() more time to complete and makes the race harder to hit. 
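Note: the hazard described above is the generic "last access after the last pin" pattern. The userspace sketch below models only the corrected ordering; the names are illustrative, and free() stands in for the MADV_DONTNEED zap described above, not for any kernel call.

    #include <stdio.h>
    #include <stdlib.h>

    struct page_model { int data; };

    /* obj stays valid only while we hold the pin; this models the
     * page-table lock being the only thing keeping the page alive. */
    static struct page_model *obj;

    static void drop_pin(void)
    {
            /* Once the pin is gone another path may free the object at
             * any time; model that by freeing it immediately. */
            free(obj);
            obj = NULL;
    }

    int main(void)
    {
            obj = malloc(sizeof(*obj));
            if (!obj)
                    return 1;
            obj->data = 1;

            /* Corrected ordering: the final touch of the object (the
             * unlock_page() in the patch) happens before whatever drops
             * the pin (the wp_page_reuse() call, which releases the
             * page-table lock in this tree). */
            printf("last access: %d\n", obj->data);
            drop_pin();

            /* Accessing obj here would be the use-after-free the patch removes. */
            return 0;
    }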
Fixes: 09854ba94c6a ("mm: do_wp_page() simplification") Link: https://lore.kernel.org/lkml/a46e9bbef2ed4e17778f5615e818526ef848d791.camel@redhat.com/ Link: https://lore.kernel.org/linux-mm/c41149a8-211e-390b-af1d-d5eee690fecb@linux.alibaba.com/ Reported-by: Qian Cai Reported-by: Alex Shi Bisected-and-analyzed-by: Gerald Schaefer Tested-by: Gerald Schaefer Signed-off-by: Linus Torvalds Change-Id: Idae0eb8d7478ab61de567ca9e446df4cc4dfc2a0 --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index 8e44f5d2d5f4..0c69908d3eed 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2414,9 +2414,9 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, * page count reference, and the page is locked, * it's dark out, and we're wearing sunglasses. Hit it. */ + unlock_page(old_page); wp_page_reuse(mm, vma, address, page_table, ptl, orig_pte, old_page, 0, 0); - unlock_page(old_page); return VM_FAULT_WRITE; } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == (VM_WRITE|VM_SHARED))) { -- cgit v1.2.3 From c0b317c27d445025c40d2f3af1a052115e027e5e Mon Sep 17 00:00:00 2001 From: Tengfei Fan Date: Mon, 19 Nov 2018 13:45:29 +0800 Subject: pinctrl: qcom: Clear status bit on irq_unmask The gpio interrupt status bit is getting set after the irq is disabled and causing an immediate interrupt after enablling the irq, so clear status bit on irq_unmask. Change-Id: I89245b90b06b37671369e59c15fb24a991cc114a Signed-off-by: Tengfei Fan --- drivers/pinctrl/qcom/pinctrl-msm.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/pinctrl/qcom/pinctrl-msm.c b/drivers/pinctrl/qcom/pinctrl-msm.c index d4a1f5378ac5..ee8c09717597 100644 --- a/drivers/pinctrl/qcom/pinctrl-msm.c +++ b/drivers/pinctrl/qcom/pinctrl-msm.c @@ -631,6 +631,7 @@ static void msm_gpio_irq_enable(struct irq_data *d) static void msm_gpio_irq_unmask(struct irq_data *d) { struct gpio_chip *gc = irq_data_get_irq_chip_data(d); + uint32_t irqtype = irqd_get_trigger_type(d); struct msm_pinctrl *pctrl = to_msm_pinctrl(gc); const struct msm_pingroup *g; unsigned long flags; @@ -640,6 +641,12 @@ static void msm_gpio_irq_unmask(struct irq_data *d) spin_lock_irqsave(&pctrl->lock, flags); + if (irqtype & (IRQF_TRIGGER_HIGH | IRQF_TRIGGER_LOW)) { + val = readl_relaxed(pctrl->regs + g->intr_status_reg); + val &= ~BIT(g->intr_status_bit); + writel_relaxed(val, pctrl->regs + g->intr_status_reg); + } + val = readl(pctrl->regs + g->intr_status_reg); val &= ~BIT(g->intr_status_bit); writel(val, pctrl->regs + g->intr_status_reg); -- cgit v1.2.3 From a9314f9d8ad402f17e107f2f4a11636e50301cfa Mon Sep 17 00:00:00 2001 From: Maria Yu Date: Mon, 15 Apr 2019 12:41:12 +0800 Subject: sched/fair: Allow load bigger task load balance when nr_running is 2 When there is only 2 tasks in 1 cpu and the other task is currently running, allow load bigger task to be balanced if the other task is currently running. 
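Note: the diff below is easier to read as a predicate: the "task load is more than twice the imbalance" cut-off should only reject a task when the source CPU still has other candidates, or when big tasks are being ignored anyway; with exactly two tasks and one of them running, the remaining task is the only thing that can move. A standalone sketch of that decision, with invented names standing in for the lb_env fields:

    #include <stdbool.h>
    #include <stdio.h>

    /* Userspace model of the filter added in detach_tasks();
     * names and types are illustrative only. */
    static bool skip_as_too_big(unsigned long load, unsigned long imbalance,
                                unsigned int src_nr_running,
                                bool ignore_big_tasks)
    {
            /* Apply the "load > 2 * imbalance" cut-off only when the
             * source CPU still has other tasks that could be balanced
             * instead, or when big tasks are deliberately ignored. */
            if (src_nr_running > 2 || ignore_big_tasks)
                    return (load / 2) > imbalance;
            return false;
    }

    int main(void)
    {
            /* Two tasks on the source CPU, one of them running: the other
             * task is the only candidate, so it is not skipped even if big. */
            printf("nr=2, big task: skip=%d\n",
                   skip_as_too_big(800, 100, 2, false));
            /* Three tasks: the big one can be skipped in favour of the rest. */
            printf("nr=3, big task: skip=%d\n",
                   skip_as_too_big(800, 100, 3, false));
            return 0;
    }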
Change-Id: I489e9624ba010f9293272a67585e8209a786b787 Signed-off-by: Maria Yu --- kernel/sched/fair.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3c0a8050b77d..941424604fdd 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8953,7 +8953,17 @@ redo: if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed) goto next; - if ((load / 2) > env->imbalance) + /* + * p is not running task when we goes until here, so if p is one + * of the 2 task in src cpu rq and not the running one, + * that means it is the only task that can be balanced. + * So only when there is other tasks can be balanced or + * there is situation to ignore big task, it is needed + * to skip the task load bigger than 2*imbalance. + */ + if (((cpu_rq(env->src_cpu)->nr_running > 2) || + (env->flags & LBF_IGNORE_BIG_TASKS)) && + ((load / 2) > env->imbalance)) goto next; detach_task(p, env); -- cgit v1.2.3 From b9b6bc6ea3c06ab2edac96db7a3f9e51c9e459d1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 17 Jun 2017 08:10:08 -0400 Subject: sched: Allow migrating kthreads into online but inactive CPUs Per-cpu workqueues have been tripping CPU affinity sanity checks while a CPU is being offlined. A per-cpu kworker ends up running on a CPU which isn't its target CPU while the CPU is online but inactive. While the scheduler allows kthreads to wake up on an online but inactive CPU, it doesn't allow a running kthread to be migrated to such a CPU, which leads to an odd situation where setting affinity on a sleeping and running kthread leads to different results. Each mem-reclaim workqueue has one rescuer which guarantees forward progress and the rescuer needs to bind itself to the CPU which needs help in making forward progress; however, due to the above issue, while set_cpus_allowed_ptr() succeeds, the rescuer doesn't end up on the correct CPU if the CPU is in the process of going offline, tripping the sanity check and executing the work item on the wrong CPU. This patch updates __migrate_task() so that kthreads can be migrated into an inactive but online CPU. Change-Id: I38cc3eb3b2ec5b7034cc72a2bcdd32a549314915 Signed-off-by: Tejun Heo Reported-by: "Paul E. McKenney" Reported-by: Steven Rostedt --- kernel/sched/core.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b33433586774..3c64cd08e8e2 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1144,8 +1144,13 @@ static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_ { int src_cpu; - if (unlikely(!cpu_active(dest_cpu))) - return rq; + if (p->flags & PF_KTHREAD) { + if (unlikely(!cpu_online(dest_cpu))) + return ret; + } else { + if (unlikely(!cpu_active(dest_cpu))) + return ret; + } /* Affinity changed (again). */ if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) -- cgit v1.2.3 From c9999f04236e640bc82b2085cadca8487184bc76 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 16 Jun 2016 15:35:04 -0400 Subject: sched/core: Allow kthreads to fall back to online && !active cpus During CPU hotplug, CPU_ONLINE callbacks are run while the CPU is online but not active. A CPU_ONLINE callback may create or bind a kthread so that its cpus_allowed mask only allows the CPU which is being brought online. The kthread may start executing before the CPU is made active and can end up in select_fallback_rq(). 
In such cases, the expected behavior is selecting the CPU which is coming online; however, because select_fallback_rq() only chooses from active CPUs, it determines that the task doesn't have any viable CPU in its allowed mask and ends up overriding it to cpu_possible_mask. CPU_ONLINE callbacks should be able to put kthreads on the CPU which is coming online. Update select_fallback_rq() so that it follows cpu_online() rather than cpu_active() for kthreads. Reported-by: Gautham R Shenoy Tested-by: Gautham R. Shenoy Change-Id: I562dcc53717b1f2f8324abffb652b91592ba8d5c Signed-off-by: Tejun Heo Signed-off-by: Peter Zijlstra (Intel) Cc: Abdul Haleem Cc: Aneesh Kumar Cc: Linus Torvalds Cc: Michael Ellerman Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: kernel-team@fb.com Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/20160616193504.GB3262@mtj.duckdns.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3c64cd08e8e2..e83959592d6c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1654,9 +1654,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p, bool allow_iso) for (;;) { /* Any allowed, online CPU? */ for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) { - if (!cpu_online(dest_cpu)) + if (!(p->flags & PF_KTHREAD) && !cpu_active(dest_cpu)) continue; - if (!cpu_active(dest_cpu)) + if (!cpu_online(dest_cpu)) continue; if (cpu_isolated(dest_cpu)) { if (allow_iso) -- cgit v1.2.3 From 0ffdb886996b66e1eb9bbda6e9605bf7dbca0cd1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 25 Jul 2017 18:58:21 +0200 Subject: BACKPORT: sched/core: Fix rules for running on online && !active CPUs As already enforced by the WARN() in __set_cpus_allowed_ptr(), the rules for running on an online && !active CPU are stricter than just being a kthread, you need to be a per-cpu kthread. If you're not strictly per-CPU, you have better CPUs to run on and don't need the partially booted one to get your work done. The exception is to allow smpboot threads to bootstrap the CPU itself and get kernel 'services' initialized before we allow userspace on it. Change-Id: I515e873a6e5be0cde7771ecedf56101614300fe2 Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Tejun Heo Cc: Thomas Gleixner Fixes: 955dbdf4ce87 ("sched: Allow migrating kthreads into online but inactive CPUs") Link: http://lkml.kernel.org/r/20170725165821.cejhb7v2s3kecems@hirez.programming.kicks-ass.net Signed-off-by: Ingo Molnar Backported to 4.4 Signed-off-by: joshuous --- kernel/sched/core.c | 41 +++++++++++++++++++++++++++++------------ 1 file changed, 29 insertions(+), 12 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e83959592d6c..f73831486c2f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1085,6 +1085,33 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) } #ifdef CONFIG_SMP + +static inline bool is_per_cpu_kthread(struct task_struct *p) +{ + if (!(p->flags & PF_KTHREAD)) + return false; + + if (p->nr_cpus_allowed != 1) + return false; + + return true; +} + +/* + * Per-CPU kthreads are allowed to run on !actie && online CPUs, see + * __set_cpus_allowed_ptr() and select_fallback_rq(). 
+ */ +static inline bool is_cpu_allowed(struct task_struct *p, int cpu) +{ + if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) + return false; + + if (is_per_cpu_kthread(p)) + return cpu_online(cpu); + + return cpu_active(cpu); +} + /* * This is how migration works: * @@ -1144,16 +1171,8 @@ static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_ { int src_cpu; - if (p->flags & PF_KTHREAD) { - if (unlikely(!cpu_online(dest_cpu))) - return ret; - } else { - if (unlikely(!cpu_active(dest_cpu))) - return ret; - } - /* Affinity changed (again). */ - if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) + if (!is_cpu_allowed(p, dest_cpu)) return rq; src_cpu = cpu_of(rq); @@ -1654,9 +1673,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p, bool allow_iso) for (;;) { /* Any allowed, online CPU? */ for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) { - if (!(p->flags & PF_KTHREAD) && !cpu_active(dest_cpu)) - continue; - if (!cpu_online(dest_cpu)) + if (!is_cpu_allowed(p, dest_cpu)) continue; if (cpu_isolated(dest_cpu)) { if (allow_iso) -- cgit v1.2.3 From eccc8acbe705a20e0911ea776371d84eba53cc8e Mon Sep 17 00:00:00 2001 From: Maria Yu Date: Fri, 26 Apr 2019 15:20:18 +0800 Subject: sched/fair: Avoid unnecessary active load balance When find busiest group, it will avoid load balance if it is only 1 task running on src cpu. Consider race when different cpus do newly idle load balance at the same time, check src cpu nr_running to avoid unnecessary active load balance again. See the race condition example here: 1) cpu2 have 2 tasks, so cpu2 rq->nr_running == 2 and cfs.h_nr_running ==2. 2) cpu4 and cpu5 doing newly idle load balance at the same time. 3) cpu4 and cpu5 both see cpu2 sched_load_balance_sg_stats sum_nr_run=2 so they are both see cpu2 as the busiest rq. 4) cpu5 did a success migration task from cpu2, so cpu2 only have 1 task left, cpu2 rq->nr_running == 1 and cfs.h_nr_running ==1. 5) cpu4 surely goes to no_move because currently cpu4 only have 1 task which is currently running. 6) and then cpu4 goes here to check if cpu2 need active load balance. Change-Id: Ia9539a43e9769c4936f06ecfcc11864984c50c29 Signed-off-by: Maria Yu --- kernel/sched/fair.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 941424604fdd..42f05c742846 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -10456,8 +10456,10 @@ static int need_active_balance(struct lb_env *env) * It's worth migrating the task if the src_cpu's capacity is reduced * because of other sched_class or IRQs if more capacity stays * available on dst_cpu. + * Avoid pulling the CFS task if it is the only task running. */ if ((env->idle != CPU_NOT_IDLE) && + (env->src_rq->nr_running > 1) && (env->src_rq->cfs.h_nr_running == 1)) { if ((check_cpu_capacity(env->src_rq, sd)) && (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100)) -- cgit v1.2.3 From 8d8a48aecde5c4be6c57b9108dc22e8e0cd7f235 Mon Sep 17 00:00:00 2001 From: Jeffrey Hugo Date: Fri, 19 May 2017 23:49:11 -0400 Subject: sched/fair: Fix load_balance() affinity redo path If load_balance() fails to migrate any tasks because all tasks were affined, load_balance() removes the source cpu from consideration and attempts to redo and balance among the new subset of cpus. There is a bug in this code path where the algorithm considers all active cpus in the system (minus the source that was just masked out). 
This is not valid for two reasons: some active cpus may not be in the current scheduling domain and one of the active cpus is dst_cpu. These cpus should not be considered, as we cannot pull load from them. Instead of failing out of load_balance(), we may end up redoing the search with no valid cpus and incorrectly concluding the domain is balanced. Additionally, if the group_imbalance flag was just set, it may also be incorrectly unset, thus the flag will not be seen by other cpus in future load_balance() runs as that algorithm intends. Fix the check by removing cpus not in the current domain and the dst_cpu from considertation, thus limiting the evaluation to valid remaining cpus from which load might be migrated. Co-authored-by: Austin Christ Co-authored-by: Dietmar Eggemann Signed-off-by: Jeffrey Hugo Tested-by: Tyler Baicar Change-Id: Ife6701c9c62e7155493d9db9398f08c4474e94b3 --- kernel/sched/fair.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 42f05c742846..a2f52c35c76a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -10697,7 +10697,24 @@ more_balance: /* All tasks on this runqueue were pinned by CPU affinity */ if (unlikely(env.flags & LBF_ALL_PINNED)) { cpumask_clear_cpu(cpu_of(busiest), cpus); - if (!cpumask_empty(cpus)) { + /* + * dst_cpu is not a valid busiest cpu in the following + * check since load cannot be pulled from dst_cpu to be + * put on dst_cpu. + */ + cpumask_clear_cpu(env.dst_cpu, cpus); + /* + * Go back to "redo" iff the load-balance cpumask + * contains other potential busiest cpus for the + * current sched domain. + */ + if (cpumask_intersects(cpus, sched_domain_span(env.sd))) { + /* + * Now that the check has passed, reenable + * dst_cpu so that load can be calculated on + * it in the redo path. + */ + cpumask_set_cpu(env.dst_cpu, cpus); env.loop = 0; env.loop_break = sched_nr_migrate_break; goto redo; -- cgit v1.2.3 From 938e42ca699f3224fcb9687bf9feba3f4a1abf32 Mon Sep 17 00:00:00 2001 From: Maulik Shah Date: Thu, 21 Jun 2018 11:48:23 +0530 Subject: drivers: cpuidle: lpm-levels: Correctly check for list empty Correctly check for list empty condition to get least cluster latency. Change-Id: I6584a8d6d77794ca506c994d927467e9c1fefa63 Signed-off-by: Maulik Shah --- drivers/cpuidle/lpm-levels.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cpuidle/lpm-levels.c b/drivers/cpuidle/lpm-levels.c index 1eaef20e5ed5..f46126e41266 100644 --- a/drivers/cpuidle/lpm-levels.c +++ b/drivers/cpuidle/lpm-levels.c @@ -187,7 +187,7 @@ static uint32_t least_cluster_latency(struct lpm_cluster *cluster, uint32_t latency = 0; int i; - if (!cluster->list.next) { + if (list_empty(&cluster->list)) { for (i = 0; i < cluster->nlevels; i++) { level = &cluster->levels[i]; pwr_params = &level->pwr; -- cgit v1.2.3 From c71b8fffe6b3d099b76c05f922fde8aa4b6c2334 Mon Sep 17 00:00:00 2001 From: Lina Iyer Date: Mon, 17 Jul 2017 11:50:25 -0600 Subject: drivers: cpuidle: lpm-levels: Fix KW issues with idle state idx < 0 Idle state calculcations will need to return the state chosen as an integer. The state chosen is used as a index into the array and as such cannot be negative value. Do not return negative errors from the calculations. By default, the state returned wil be zero. 
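Note: the contract after this change is simply "cpu_power_select() always returns a usable array index, never an error code". The sketch below is a minimal standalone model of that pattern with an invented latency table; it is not the driver's actual selection logic.

    #include <stdbool.h>
    #include <stdio.h>

    #define NR_STATES 4

    /* Invented per-state exit latencies; index 0 is the shallowest state. */
    static const int state_latency_us[NR_STATES] = { 1, 50, 500, 2000 };

    /* Starts from state 0 and only moves deeper while constraints allow,
     * so no caller can ever receive a negative value to index with. */
    static int select_state(int latency_limit_us, bool sleep_disabled)
    {
            int best = 0;   /* default: shallowest state */

            if (sleep_disabled)
                    return best;

            for (int i = 1; i < NR_STATES; i++) {
                    if (state_latency_us[i] > latency_limit_us)
                            break;
                    best = i;
            }
            return best;
    }

    int main(void)
    {
            printf("%d\n", select_state(600, false));  /* prints 2 */
            printf("%d\n", select_state(0, false));    /* prints 0, never negative */
            return 0;
    }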
Change-Id: Idb18e933f385cf868fe99fa6a2783f6b8e84c196 Signed-off-by: Lina Iyer --- drivers/cpuidle/lpm-levels.c | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/drivers/cpuidle/lpm-levels.c b/drivers/cpuidle/lpm-levels.c index f46126e41266..dca59eadc6c2 100644 --- a/drivers/cpuidle/lpm-levels.c +++ b/drivers/cpuidle/lpm-levels.c @@ -691,7 +691,7 @@ static void update_history(struct cpuidle_device *dev, int idx); static int cpu_power_select(struct cpuidle_device *dev, struct lpm_cpu *cpu) { - int best_level = -1; + int best_level = 0; uint32_t latency_us = pm_qos_request_for_cpu(PM_QOS_CPU_DMA_LATENCY, dev->cpu); s64 sleep_us = ktime_to_us(tick_nohz_get_sleep_length()); @@ -705,8 +705,6 @@ static int cpu_power_select(struct cpuidle_device *dev, uint32_t *min_residency = get_per_cpu_min_residency(dev->cpu); uint32_t *max_residency = get_per_cpu_max_residency(dev->cpu); - if (!cpu) - return -EINVAL; if ((sleep_disabled && !cpu_isolated(dev->cpu)) || sleep_us < 0) return 0; @@ -1536,17 +1534,11 @@ static int lpm_cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev) { struct lpm_cluster *cluster = per_cpu(cpu_cluster, dev->cpu); - int idx; if (!cluster) return 0; - idx = cpu_power_select(dev, cluster->cpu); - - if (idx < 0) - return -EPERM; - - return idx; + return cpu_power_select(dev, cluster->cpu); } static void update_history(struct cpuidle_device *dev, int idx) @@ -1591,9 +1583,6 @@ static int lpm_cpuidle_enter(struct cpuidle_device *dev, int64_t start_time = ktime_to_ns(ktime_get()), end_time; struct power_params *pwr_params; - if (idx < 0) - return -EINVAL; - pwr_params = &cluster->cpu->levels[idx].pwr; sched_set_cpu_cstate(smp_processor_id(), idx + 1, pwr_params->energy_overhead, pwr_params->latency_us); -- cgit v1.2.3 From 43cbf9d6153d820d68ba96ddb1a57dbdc7373c90 Mon Sep 17 00:00:00 2001 From: Pavankumar Kondeti Date: Wed, 27 Mar 2019 13:31:40 +0530 Subject: sched/tune: Increase the cgroup limit to 6 The schedtune cgroup controller allows upto 5 cgroups including the default/root cgroup. Until now the user space is creating only 4 additional cgroups namely, foreground, background, top-app and audio-app. Recently another cgroup called rt is created before the audio-app cgroup. Since kernel limits the cgroups to 5, the creation of audio-app cgroup is failing. Fix this by increasing the schedtune cgroup controller cgroup limit to 6. Change-Id: I13252a90dba9b8010324eda29b8901cb0b20bc21 Signed-off-by: Pavankumar Kondeti --- kernel/sched/tune.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index b84d13750604..d8beda0d74b4 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -240,7 +240,7 @@ schedtune_accept_deltas(int nrg_delta, int cap_delta, * implementation especially for the computation of the per-CPU boost * value */ -#define BOOSTGROUPS_COUNT 5 +#define BOOSTGROUPS_COUNT 6 /* Array of configured boostgroups */ static struct schedtune *allocated_group[BOOSTGROUPS_COUNT] = { -- cgit v1.2.3 From 41cbb7bc59fb94f020839f20b890033e2f407ca3 Mon Sep 17 00:00:00 2001 From: Joonwoo Park Date: Thu, 1 Jun 2017 10:59:12 -0700 Subject: sched: walt: fix window misalignment when HZ=300 Due to rounding error hrtimer tick interval becomes 3333333 ns when HZ=300. Consequently the tick time stamp nearest to the WALT's default window size 20ms will be also 19999998 (3333333 * 6). 
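Note: the numbers above follow from integer arithmetic alone, and the same rounding is what the later "Correct WALT window size initialization" patch in this series applies generically. A quick standalone check, approximating TICK_NSEC as NSEC_PER_SEC / HZ:

    #include <stdio.h>

    int main(void)
    {
            const unsigned long long NSEC_PER_SEC = 1000000000ULL;
            unsigned long long hz = 300;
            unsigned long long tick_nsec = NSEC_PER_SEC / hz;  /* 3333333 */
            unsigned long long window = 20000000ULL;           /* 20 ms */

            /* Align the window to a whole number of ticks. */
            unsigned long long aligned = (window / tick_nsec) * tick_nsec;

            printf("tick    = %llu ns\n", tick_nsec);  /* 3333333  */
            printf("aligned = %llu ns\n", aligned);    /* 19999998 */
            return 0;
    }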
[beykerykt]: Adapt for HMP Change-Id: I08f9bd2dbecccbb683e4490d06d8b0da703d3ab2 Suggested-by: Joel Fernandes Signed-off-by: Joonwoo Park --- kernel/sched/hmp.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/kernel/sched/hmp.c b/kernel/sched/hmp.c index 598656b42203..f9fff7217eaa 100644 --- a/kernel/sched/hmp.c +++ b/kernel/sched/hmp.c @@ -765,7 +765,15 @@ unsigned int min_max_possible_capacity = 1024; /* min(rq->max_possible_capacity) */ /* Min window size (in ns) = 10ms */ +#ifdef CONFIG_HZ_300 +/* + * Tick interval becomes to 3333333 due to + * rounding error when HZ=300. + */ +#define MIN_SCHED_RAVG_WINDOW (3333333 * 6) +#else #define MIN_SCHED_RAVG_WINDOW 10000000 +#endif /* Max window size (in ns) = 1s */ #define MAX_SCHED_RAVG_WINDOW 1000000000 -- cgit v1.2.3 From 0fa652ee00f5aaf9fdebea0e0f840e59bdb6795b Mon Sep 17 00:00:00 2001 From: Vikram Mulukutla Date: Thu, 10 Aug 2017 17:26:20 -0700 Subject: sched: walt: Correct WALT window size initialization It is preferable that WALT window rollover occurs just before a tick, since the tick is an opportune moment to record a complete window's statistics, as well as report those stats to the cpu frequency governor. When CONFIG_HZ results in a TICK_NSEC that isn't a integral number, this requirement may be violated. Account for this by reducing the WALT window size to the nearest multiple of TICK_NSEC. Commit d368c6faa19b ("sched: walt: fix window misalignment when HZ=300") attempted to do this but WALT isn't using MIN_SCHED_RAVG_WINDOW as the window size and the patch was doing nothing. Also, change the type of 'walt_disabled' to bool and warn if an invalid window size causes WALT to be disabled. [beykerykt]: Adapt for HMP Change-Id: Ie3dcfc21a3df4408254ca1165a355bbe391ed5c7 Signed-off-by: Vikram Mulukutla --- kernel/sched/hmp.c | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/kernel/sched/hmp.c b/kernel/sched/hmp.c index f9fff7217eaa..1755e919f8f4 100644 --- a/kernel/sched/hmp.c +++ b/kernel/sched/hmp.c @@ -765,20 +765,15 @@ unsigned int min_max_possible_capacity = 1024; /* min(rq->max_possible_capacity) */ /* Min window size (in ns) = 10ms */ -#ifdef CONFIG_HZ_300 -/* - * Tick interval becomes to 3333333 due to - * rounding error when HZ=300. - */ -#define MIN_SCHED_RAVG_WINDOW (3333333 * 6) -#else -#define MIN_SCHED_RAVG_WINDOW 10000000 -#endif +#define MIN_SCHED_RAVG_WINDOW ((10000000 / TICK_NSEC) * TICK_NSEC) /* Max window size (in ns) = 1s */ -#define MAX_SCHED_RAVG_WINDOW 1000000000 +#define MAX_SCHED_RAVG_WINDOW ((1000000000 / TICK_NSEC) * TICK_NSEC) -/* Window size (in ns) */ +/* + * Window size (in ns). Adjust for the tick size so that the window + * rollover occurs just before the tick boundary. 
+ */ __read_mostly unsigned int sched_ravg_window = MIN_SCHED_RAVG_WINDOW; /* Maximum allowed threshold before freq aggregation must be enabled */ @@ -1624,17 +1619,20 @@ static inline int exiting_task(struct task_struct *p) static int __init set_sched_ravg_window(char *str) { + unsigned int adj_window; unsigned int window_size; get_option(&str, &window_size); - if (window_size < MIN_SCHED_RAVG_WINDOW || - window_size > MAX_SCHED_RAVG_WINDOW) { - WARN_ON(1); - return -EINVAL; - } + /* Adjust for CONFIG_HZ */ + adj_window = (window_size / TICK_NSEC) * TICK_NSEC; + + /* Warn if we're a bit too far away from the expected window size */ + WARN(adj_window < window_size - NSEC_PER_MSEC, + "tick-adjusted window size %u, original was %u\n", adj_window, + window_size); - sched_ravg_window = window_size; + sched_ravg_window = adj_window; return 0; } -- cgit v1.2.3 From 6adb092856e806d91f3fc22dff0ef36506dd0bae Mon Sep 17 00:00:00 2001 From: Vikram Mulukutla Date: Tue, 6 Jun 2017 11:58:27 -0700 Subject: sched: cpufreq: Limit governor updates to WALT changes alone It's not necessary to keep reporting load to the governor if it doesn't change in a window. Limit updates to when we expect load changes - after window rollover and when we send updates related to intercluster migrations. [beykerykt]: Adapt for HMP Change-Id: I3232d40f3d54b0b81cfafdcdb99b534df79327bf Signed-off-by: Vikram Mulukutla --- include/linux/sched.h | 1 + kernel/sched/hmp.c | 6 ++++-- kernel/sched/sched.h | 4 +++- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 9cb6964d178e..0e8fff43cc17 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -3724,6 +3724,7 @@ static inline unsigned long rlimit_max(unsigned int limit) #define SCHED_CPUFREQ_DL (1U << 1) #define SCHED_CPUFREQ_IOWAIT (1U << 2) #define SCHED_CPUFREQ_INTERCLUSTER_MIG (1U << 3) +#define SCHED_CPUFREQ_WALT (1U << 4) #ifdef CONFIG_CPU_FREQ struct update_util_data { diff --git a/kernel/sched/hmp.c b/kernel/sched/hmp.c index 1755e919f8f4..6a403be2ae7c 100644 --- a/kernel/sched/hmp.c +++ b/kernel/sched/hmp.c @@ -3663,8 +3663,10 @@ void fixup_busy_time(struct task_struct *p, int new_cpu) migrate_top_tasks(p, src_rq, dest_rq); if (!same_freq_domain(new_cpu, task_cpu(p))) { - cpufreq_update_util(dest_rq, SCHED_CPUFREQ_INTERCLUSTER_MIG); - cpufreq_update_util(src_rq, SCHED_CPUFREQ_INTERCLUSTER_MIG); + cpufreq_update_util(dest_rq, SCHED_CPUFREQ_INTERCLUSTER_MIG | + SCHED_CPUFREQ_WALT); + cpufreq_update_util(src_rq, SCHED_CPUFREQ_INTERCLUSTER_MIG | + SCHED_CPUFREQ_WALT); } if (p == src_rq->ed_task) { diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 90cc450dff7e..40da1a509ded 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2856,8 +2856,10 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) #ifdef CONFIG_SCHED_HMP /* * Skip if we've already reported, but not if this is an inter-cluster - * migration + * migration. Also only allow WALT update sites. */ + if (!(flags & SCHED_CPUFREQ_WALT)) + return; if (!sched_disable_window_stats && (rq->load_reported_window == rq->window_start) && !(flags & SCHED_CPUFREQ_INTERCLUSTER_MIG)) -- cgit v1.2.3 From c7128748614ad0ca5bf9533ea9723bdd2ddf8838 Mon Sep 17 00:00:00 2001 From: Pavankumar Kondeti Date: Mon, 28 May 2018 15:39:20 +0530 Subject: sched/cpupri: Exclude isolated CPUs from the lowest_mask The cpupri_find() returns the candidate CPUs which are running lower priority than the waking RT task in the lowest_mask. 
This contains isolated CPUs as well. Since the energy aware CPU selection skips isolated CPUs, no target CPU may be found if all unisolated CPUs are running higher priority RT tasks. In which case, we fallback to the default CPU selection algorithm and returns an isolated CPU. This decision is reversed by select_task_rq() and returns an unisolated CPU that is busy with other RT tasks. This RT task packing is desired behavior. However, RT push mechanism pushes the packed RT task to an isolated CPU. This can be avoided by excluding isolated CPUs from the lowest_mask returned by cpupri_find(). Change-Id: I75486b3935caf496a638d0333565beffc47fe249 Signed-off-by: Pavankumar Kondeti --- kernel/sched/cpupri.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index 14225d5d8617..867cb7877511 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -133,6 +133,8 @@ retry: if (lowest_mask) { cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask); + cpumask_andnot(lowest_mask, lowest_mask, + cpu_isolated_mask); if (drop_nopreempts) drop_nopreempt_cpus(lowest_mask); /* -- cgit v1.2.3 From ef3fb04c7df43dfa1793e33f764a2581cda96310 Mon Sep 17 00:00:00 2001 From: Vikram Mulukutla Date: Mon, 8 May 2017 19:20:22 -0700 Subject: sched: cpufreq: Use sched_clock instead of rq_clock when updating schedutil rq_clock may not be updated often enough for schedutil or other cpufreq governors to work correctly when it's passed as the timestamp for a load report. Use sched_clock instead. [beykerykt]: Switch to sched_ktime_clock() Change-Id: I745b727870a31da25f766c2c2f37527f568c20da Signed-off-by: Vikram Mulukutla --- kernel/sched/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 40da1a509ded..e78a3e867472 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2869,7 +2869,7 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)); if (data) - data->func(data, rq_clock(rq), flags); + data->func(data, sched_ktime_clock(), flags); } static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) -- cgit v1.2.3 From 4dbe44554792f83b785eed187aa1bcd69e84094c Mon Sep 17 00:00:00 2001 From: Vikram Mulukutla Date: Tue, 9 May 2017 17:49:47 -0700 Subject: sched: cpufreq: Use per_cpu_ptr instead of this_cpu_ptr when reporting load We need cpufreq_update_util to report load for the CPU corresponding to the rq that is passed in as an argument, rather than the CPU executing cpufreq_update_util. 
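Note: the point is the difference between "the per-CPU slot of whichever CPU happens to execute this code" and "the per-CPU slot of the CPU the runqueue describes". A userspace sketch with a plain array standing in for the per-CPU data; all names are invented for illustration:

    #include <stdio.h>

    #define NR_CPUS 4

    /* Stand-in for the per-CPU update hooks. */
    static const char *cpu_hook[NR_CPUS] = { "hook0", "hook1", "hook2", "hook3" };

    static int executing_cpu = 0;   /* the CPU running the update code */

    /* this_cpu_ptr() analogue: indexes by the executing CPU. */
    static const char *hook_of_executor(void)
    {
            return cpu_hook[executing_cpu];
    }

    /* per_cpu_ptr(..., cpu) analogue: indexes by the CPU being described. */
    static const char *hook_of(int cpu)
    {
            return cpu_hook[cpu];
    }

    int main(void)
    {
            int rq_cpu = 2;   /* cpu_of(rq): the CPU whose load is reported */

            printf("wrong target: %s\n", hook_of_executor());  /* hook0 */
            printf("right target: %s\n", hook_of(rq_cpu));     /* hook2 */
            return 0;
    }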
Change-Id: I8473f230d40928d5920c614760e96fef12745d5a Signed-off-by: Vikram Mulukutla --- kernel/sched/sched.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index e78a3e867472..1196276eddf6 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2867,7 +2867,8 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) rq->load_reported_window = rq->window_start; #endif - data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)); + data = rcu_dereference_sched(*per_cpu_ptr(&cpufreq_update_util_data, + cpu_of(rq))); if (data) data->func(data, sched_ktime_clock(), flags); } -- cgit v1.2.3 From aee7a16e347b610fbb50d743c42c21e56cf11211 Mon Sep 17 00:00:00 2001 From: Joonwoo Park Date: Mon, 8 May 2017 19:39:27 -0700 Subject: sched: WALT: increase WALT minimum window size to 20ms Increase WALT minimum window size to 20ms. 10ms isn't large enough capture workload's pattern. [beykerykt}: Adapt for HMP Change-Id: I4d69577fbfeac2bc23db4ff414939cc51ada30d6 Signed-off-by: Joonwoo Park --- kernel/sched/hmp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched/hmp.c b/kernel/sched/hmp.c index 6a403be2ae7c..649d6a437a13 100644 --- a/kernel/sched/hmp.c +++ b/kernel/sched/hmp.c @@ -764,8 +764,8 @@ unsigned int max_possible_capacity = 1024; /* max(rq->max_possible_capacity) */ unsigned int min_max_possible_capacity = 1024; /* min(rq->max_possible_capacity) */ -/* Min window size (in ns) = 10ms */ -#define MIN_SCHED_RAVG_WINDOW ((10000000 / TICK_NSEC) * TICK_NSEC) +/* Min window size (in ns) = 20ms */ +#define MIN_SCHED_RAVG_WINDOW ((20000000 / TICK_NSEC) * TICK_NSEC) /* Max window size (in ns) = 1s */ #define MAX_SCHED_RAVG_WINDOW ((1000000000 / TICK_NSEC) * TICK_NSEC) -- cgit v1.2.3 From db74739c86de566ca88a8affa96d2d59cfbdbb98 Mon Sep 17 00:00:00 2001 From: Pavankumar Kondeti Date: Fri, 14 Sep 2018 13:59:59 +0530 Subject: sched: Don't fail isolation request for an already isolated CPU When isolating a CPU, a check is performed to see if there is only 1 active CPU in the system. If that is the case, the CPU is not isolated. However this check is done before testing if the requested CPU is already isolated or not. If the requested CPU is already isolated, there is no need to fail the isolation even when there is only 1 active CPU in the system. For example, 0-6 CPUs are isolated on a 8 CPU machine. When an isolation request comes for CPU6, which is already isolated, the current code fail the requesting thinking we end up with no active CPU in the system. 
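Note: the reordering below amounts to "count the vote first, and only veto brand-new isolation requests". A small standalone model of that flow, reproducing the 8-CPU example above; the counters and helper are invented for illustration and are not the kernel's bookkeeping:

    #include <stdio.h>

    #define NR_CPUS 8

    static int isolation_vote[NR_CPUS];
    static int nr_unisolated = NR_CPUS;

    /* Returns 0 on success, -1 if the request would leave no active CPU. */
    static int isolate_cpu(int cpu)
    {
            if (++isolation_vote[cpu] > 1)
                    return 0;   /* already isolated: always succeeds */

            if (nr_unisolated == 1) {   /* cannot isolate the last active CPU */
                    --isolation_vote[cpu];
                    return -1;
            }
            --nr_unisolated;
            return 0;
    }

    int main(void)
    {
            /* Isolate CPUs 0-6, then repeat the request for CPU 6. */
            for (int cpu = 0; cpu <= 6; cpu++)
                    isolate_cpu(cpu);
            printf("re-isolate cpu6: %d (expected 0)\n", isolate_cpu(6));
            printf("isolate cpu7:    %d (expected -1)\n", isolate_cpu(7));
            return 0;
    }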
Change-Id: I28fea4ff67ffed82465e5cfa785414069e4a180a Signed-off-by: Pavankumar Kondeti --- kernel/sched/core.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f73831486c2f..c597f6c61115 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5967,12 +5967,6 @@ int sched_isolate_cpu(int cpu) cpumask_andnot(&avail_cpus, cpu_online_mask, cpu_isolated_mask); - /* We cannot isolate ALL cpus in the system */ - if (cpumask_weight(&avail_cpus) == 1) { - ret_code = -EINVAL; - goto out; - } - if (!cpu_online(cpu)) { ret_code = -EINVAL; goto out; @@ -5981,6 +5975,13 @@ int sched_isolate_cpu(int cpu) if (++cpu_isolation_vote[cpu] > 1) goto out; + /* We cannot isolate ALL cpus in the system */ + if (cpumask_weight(&avail_cpus) == 1) { + --cpu_isolation_vote[cpu]; + ret_code = -EINVAL; + goto out; + } + /* * There is a race between watchdog being enabled by hotplug and * core isolation disabling the watchdog. When a CPU is hotplugged in -- cgit v1.2.3 From d5cd35f38616a70453da067eda153f5dc5ede3a1 Mon Sep 17 00:00:00 2001 From: Marco Ballesio Date: Tue, 9 Mar 2021 16:35:45 -0800 Subject: FROMGIT: binder: use EINTR for interrupted wait for work when interrupted by a signal, binder_wait_for_work currently returns -ERESTARTSYS. This error code isn't propagated to user space, but a way to handle interruption due to signals must be provided to code using this API. Replace this instance of -ERESTARTSYS with -EINTR, which is propagated to user space. Bug: 180989544 (cherry picked from commit 48f10b7ed0c23e2df7b2c752ad1d3559dad007f9 git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc.git char-misc-testing) Signed-off-by: Marco Ballesio Signed-off-by: Li Li Test: built, booted, interrupted a worker thread within Acked-by: Todd Kjos Link: https://lore.kernel.org/r/20210316011630.1121213-3-dualli@chromium.org Signed-off-by: Greg Kroah-Hartman Change-Id: Ie6c7993cab699bc2c1a25a2f9d94b200a1156e5d --- drivers/android/binder.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 7c584e2ea476..370f1452710f 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -4062,7 +4062,7 @@ static int binder_wait_for_work(struct binder_thread *thread, binder_inner_proc_lock(proc); list_del_init(&thread->waiting_thread_node); if (signal_pending(current)) { - ret = -ERESTARTSYS; + ret = -EINTR; break; } } @@ -4991,7 +4991,7 @@ err: if (thread) thread->looper_need_return = false; wait_event_interruptible(binder_user_error_wait, binder_stop_on_user_error < 2); - if (ret && ret != -ERESTARTSYS) + if (ret && ret != -EINTR) pr_info("%d:%d ioctl %x %lx returned %d\n", proc->pid, current->pid, cmd, arg, ret); err_unlocked: trace_binder_ioctl_done(ret); -- cgit v1.2.3 From c0fa7577022c4169e1aaaf1bd9e04f63d285beb2 Mon Sep 17 00:00:00 2001 From: Ethan Chen Date: Sat, 20 Jan 2018 16:35:53 -0800 Subject: sched/walt: Re-add code to allow WALT to function Change-Id: Ieb1067c5e276f872ed4c722b7d1fabecbdad87e7 --- include/linux/sched.h | 9 +++++++++ include/linux/sched/sysctl.h | 6 ++++++ include/trace/events/sched.h | 7 +++++-- kernel/sched/Makefile | 1 + kernel/sched/core.c | 26 ++++++++++++++++++++++++++ kernel/sched/cputime.c | 6 ++++-- kernel/sched/fair.c | 22 ++++++++++++++++++++++ kernel/sched/rt.c | 3 +++ kernel/sched/sched.h | 5 +++++ kernel/sched/stop_task.c | 3 +++ kernel/sched/tune.c | 2 ++ kernel/sched/walt.c | 5 ----- kernel/sched/walt.h | 2 ++ 
kernel/sysctl.c | 30 ++++++++++++++++++++++++++++++ 14 files changed, 118 insertions(+), 9 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 0e8fff43cc17..4e212132a274 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1668,6 +1668,15 @@ struct task_struct { const struct sched_class *sched_class; struct sched_entity se; struct sched_rt_entity rt; +#ifdef CONFIG_SCHED_WALT + struct ravg ravg; + /* + * 'init_load_pct' represents the initial task load assigned to children + * of this task + */ + u32 init_load_pct; + u64 last_sleep_ts; +#endif #ifdef CONFIG_SCHED_HMP struct ravg ravg; /* diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 1e1fcb8791a7..c85fe9872d07 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -41,6 +41,12 @@ extern unsigned int sysctl_sched_wakeup_granularity; extern unsigned int sysctl_sched_child_runs_first; extern unsigned int sysctl_sched_sync_hint_enable; extern unsigned int sysctl_sched_cstate_aware; +#ifdef CONFIG_SCHED_WALT +extern unsigned int sysctl_sched_use_walt_cpu_util; +extern unsigned int sysctl_sched_use_walt_task_util; +extern unsigned int sysctl_sched_walt_init_task_load_pct; +extern unsigned int sysctl_sched_walt_cpu_high_irqload; +#endif #ifdef CONFIG_SCHED_HMP diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 73cd7e502d4c..70d6012c89aa 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -1903,6 +1903,7 @@ TRACE_EVENT(walt_update_task_ravg, __array( char, comm, TASK_COMM_LEN ) __field( pid_t, pid ) __field( pid_t, cur_pid ) + __field(unsigned int, cur_freq ) __field( u64, wallclock ) __field( u64, mark_start ) __field( u64, delta_m ) @@ -1930,6 +1931,7 @@ TRACE_EVENT(walt_update_task_ravg, __entry->evt = evt; __entry->cpu = rq->cpu; __entry->cur_pid = rq->curr->pid; + __entry->cur_freq = rq->cur_freq; memcpy(__entry->comm, p->comm, TASK_COMM_LEN); __entry->pid = p->pid; __entry->mark_start = p->ravg.mark_start; @@ -1948,10 +1950,11 @@ TRACE_EVENT(walt_update_task_ravg, __entry->active_windows = p->ravg.active_windows; ), - TP_printk("wc %llu ws %llu delta %llu event %d cpu %d cur_pid %d task %d (%s) ms %llu delta %llu demand %u sum %u irqtime %llu" + TP_printk("wc %llu ws %llu delta %llu event %d cpu %d cur_freq %u cur_pid %d task %d (%s) ms %llu delta %llu demand %u sum %u irqtime %llu" " cs %llu ps %llu util %lu cur_window %u prev_window %u active_wins %u" , __entry->wallclock, __entry->win_start, __entry->delta, - __entry->evt, __entry->cpu, __entry->cur_pid, + __entry->evt, __entry->cpu, + __entry->cur_freq, __entry->cur_pid, __entry->pid, __entry->comm, __entry->mark_start, __entry->delta_m, __entry->demand, __entry->sum, __entry->irqtime, diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 7dde1b9918e4..ea301717538f 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -19,6 +19,7 @@ obj-y += core.o loadavg.o clock.o cputime.o obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o obj-y += wait.o completion.o idle.o sched_avg.o obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o energy.o +obj-$(CONFIG_SCHED_WALT) += walt.o obj-$(CONFIG_SCHED_HMP) += hmp.o boost.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c597f6c61115..17c13347d703 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -98,6 +98,7 @@ #define CREATE_TRACE_POINTS #include +#include "walt.h" 
ATOMIC_NOTIFIER_HEAD(load_alert_notifier_head); @@ -1389,6 +1390,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) p->se.nr_migrations++; perf_event_task_migrate(p); + walt_fixup_busy_time(p, new_cpu); fixup_busy_time(p, new_cpu); } @@ -2155,6 +2157,9 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags, raw_spin_lock(&rq->lock); old_load = task_load(p); + wallclock = walt_ktime_clock(); + walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); + walt_update_task_ravg(p, rq, TASK_WAKE, wallclock, 0); wallclock = sched_ktime_clock(); update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); update_task_ravg(p, rq, TASK_WAKE, wallclock, 0); @@ -2253,6 +2258,11 @@ static void try_to_wake_up_local(struct task_struct *p) update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); update_task_ravg(p, rq, TASK_WAKE, wallclock, 0); cpufreq_update_util(rq, 0); + + wallclock = walt_ktime_clock(); + + walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); + walt_update_task_ravg(p, rq, TASK_WAKE, wallclock, 0); ttwu_activate(rq, p, ENQUEUE_WAKEUP); note_task_waking(p, wallclock); } @@ -2385,6 +2395,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) #endif INIT_LIST_HEAD(&p->se.group_node); + walt_init_new_task_load(p); #ifdef CONFIG_FAIR_GROUP_SCHED p->se.cfs_rq = NULL; @@ -2669,6 +2680,7 @@ void wake_up_new_task(struct task_struct *p) struct rq *rq; add_new_task_to_grp(p); + walt_init_new_task_load(p); raw_spin_lock_irqsave(&p->pi_lock, flags); p->state = TASK_RUNNING; @@ -2687,6 +2699,7 @@ void wake_up_new_task(struct task_struct *p) #endif rq = __task_rq_lock(p); mark_task_starting(p); + walt_mark_task_starting(p); update_rq_clock(rq); post_init_entity_util_avg(&p->se); activate_task(rq, p, ENQUEUE_WAKEUP_NEW); @@ -3235,10 +3248,13 @@ void scheduler_tick(void) raw_spin_lock(&rq->lock); old_load = task_load(curr); + walt_set_window_start(rq); set_window_start(rq); update_rq_clock(rq); curr->sched_class->task_tick(rq, curr, 0); update_cpu_load_active(rq); + walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, + walt_ktime_clock(), 0); calc_global_load_tick(rq); wallclock = sched_ktime_clock(); update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); @@ -3602,6 +3618,9 @@ static void __sched notrace __schedule(bool preempt) update_rq_clock(rq); next = pick_next_task(rq, prev); + wallclock = walt_ktime_clock(); + walt_update_task_ravg(prev, rq, PUT_PREV_TASK, wallclock, 0); + walt_update_task_ravg(next, rq, PICK_NEXT_TASK, wallclock, 0); clear_tsk_need_resched(prev); clear_preempt_need_resched(); rq->clock_skip_update = 0; @@ -6364,6 +6383,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) case CPU_UP_PREPARE: raw_spin_lock_irqsave(&rq->lock, flags); + walt_set_window_start(rq); set_window_start(rq); raw_spin_unlock_irqrestore(&rq->lock, flags); rq->calc_load_update = calc_load_update; @@ -6385,6 +6405,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) sched_ttwu_pending(); /* Update our root-domain */ raw_spin_lock_irqsave(&rq->lock, flags); + walt_migrate_sync_cpu(cpu); if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); @@ -8580,6 +8601,11 @@ void __init sched_init(void) } #endif rq->max_idle_balance_cost = sysctl_sched_migration_cost; +#ifdef CONFIG_SCHED_WALT + rq->cur_irqload = 0; + rq->avg_irqload = 0; + rq->irqload_ts = 0; +#endif INIT_LIST_HEAD(&rq->cfs_tasks); diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 
e6ec68c15aa3..cf6729cb46dd 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -6,6 +6,7 @@ #include #include #include "sched.h" +#include "walt.h" #ifdef CONFIG_IRQ_TIME_ACCOUNTING @@ -79,9 +80,10 @@ void irqtime_account_irq(struct task_struct *curr) irq_time_write_end(); - if (account) + if (account) { + walt_account_irqtime(cpu, curr, delta, wallclock); sched_account_irqtime(cpu, curr, delta, wallclock); - else if (curr != this_cpu_ksoftirqd()) + } else if (curr != this_cpu_ksoftirqd()) sched_account_irqstart(cpu, curr, wallclock); local_irq_restore(flags); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a2f52c35c76a..08e608a04f5b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -55,6 +55,12 @@ unsigned int normalized_sysctl_sched_latency = 6000000ULL; unsigned int sysctl_sched_sync_hint_enable = 1; unsigned int sysctl_sched_cstate_aware = 1; +#ifdef CONFIG_SCHED_WALT +unsigned int sysctl_sched_use_walt_cpu_util = 1; +unsigned int sysctl_sched_use_walt_task_util = 1; +__read_mostly unsigned int sysctl_sched_walt_cpu_high_irqload = + (10 * NSEC_PER_MSEC); +#endif /* * The initial- and re-scaling of tunables is configurable * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) @@ -5961,6 +5967,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; cfs_rq->h_nr_running++; + walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p); inc_cfs_rq_hmp_stats(cfs_rq, p, 1); flags = ENQUEUE_WAKEUP; @@ -5969,6 +5976,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); cfs_rq->h_nr_running++; + walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p); inc_cfs_rq_hmp_stats(cfs_rq, p, 1); if (cfs_rq_throttled(cfs_rq)) @@ -6005,6 +6013,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) schedtune_enqueue_task(p, cpu_of(rq)); if (energy_aware() && !se) { + walt_inc_cumulative_runnable_avg(rq, p); if (!task_new && !rq->rd->overutilized && cpu_overutilized(rq->cpu)) { rq->rd->overutilized = true; @@ -6042,6 +6051,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; cfs_rq->h_nr_running--; + walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p); dec_cfs_rq_hmp_stats(cfs_rq, p, 1); /* Don't dequeue parent if it has other entities besides us */ @@ -6062,6 +6072,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); cfs_rq->h_nr_running--; + walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p); dec_cfs_rq_hmp_stats(cfs_rq, p, 1); if (cfs_rq_throttled(cfs_rq)) @@ -7098,6 +7109,12 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, static inline unsigned long task_util(struct task_struct *p) { +#ifdef CONFIG_SCHED_WALT + if (!walt_disabled && sysctl_sched_use_walt_cpu_util) { + unsigned long demand = p->ravg.demand; + return (demand << 10) / walt_ravg_window; + } +#endif return p->se.avg.util_avg; } @@ -7656,6 +7673,11 @@ static inline int find_best_target(struct task_struct *p, int *backup_cpu, if (new_util > capacity_orig) continue; +#ifdef CONFIG_SCHED_WALT + if (walt_cpu_high_irqload(i)) + continue; +#endif + /* * Case A) Latency sensitive tasks * diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 391ec29c71c0..2083a54cdd49 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -12,6 +12,7 @@ #include #include "tune.h" +#include "walt.h" int sched_rr_timeslice = RR_TIMESLICE; 
int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE; @@ -1449,6 +1450,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) rt_se->timeout = 0; enqueue_rt_entity(rt_se, flags); + walt_inc_cumulative_runnable_avg(rq, p); inc_hmp_sched_stats_rt(rq, p); if (!task_current(rq, p) && p->nr_cpus_allowed > 1) @@ -1488,6 +1490,7 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) update_curr_rt(rq); dequeue_rt_entity(rt_se, flags); + walt_dec_cumulative_runnable_avg(rq, p); dec_hmp_sched_stats_rt(rq, p); dequeue_pushable_task(rq, p); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 1196276eddf6..284cc86d3ad4 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -511,6 +511,10 @@ struct cfs_rq { struct list_head leaf_cfs_rq_list; struct task_group *tg; /* group that "owns" this runqueue */ +#ifdef CONFIG_SCHED_WALT + u64 cumulative_runnable_avg; +#endif + #ifdef CONFIG_CFS_BANDWIDTH #ifdef CONFIG_SCHED_HMP @@ -819,6 +823,7 @@ struct rq { #endif #ifdef CONFIG_SCHED_WALT + unsigned int cur_freq; u64 cumulative_runnable_avg; u64 window_start; u64 curr_runnable_sum; diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 3278c81cefb1..0fa11d86599e 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -1,4 +1,5 @@ #include "sched.h" +#include "walt.h" /* * stop-task scheduling class. @@ -78,6 +79,7 @@ static void enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) { add_nr_running(rq, 1); + walt_inc_cumulative_runnable_avg(rq, p); inc_hmp_sched_stats_stop(rq, p); } @@ -85,6 +87,7 @@ static void dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) { sub_nr_running(rq, 1); + walt_dec_cumulative_runnable_avg(rq, p); dec_hmp_sched_stats_stop(rq, p); } diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index d8beda0d74b4..d0ef97f484b1 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -770,6 +770,7 @@ boost_write(struct cgroup_subsys_state *css, struct cftype *cft, static void schedtune_attach(struct cgroup_taskset *tset) { +#ifdef CONFIG_SCHED_HMP struct task_struct *task; struct cgroup_subsys_state *css; struct schedtune *st; @@ -782,6 +783,7 @@ static void schedtune_attach(struct cgroup_taskset *tset) cgroup_taskset_for_each(task, css, tset) sync_cgroup_colocation(task, colocate); +#endif } static struct cftype files[] = { diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c index 8d25ffbe4fed..911606537808 100644 --- a/kernel/sched/walt.c +++ b/kernel/sched/walt.c @@ -57,11 +57,6 @@ static unsigned int sync_cpu; static ktime_t ktime_last; static bool walt_ktime_suspended; -static unsigned int task_load(struct task_struct *p) -{ - return p->ravg.demand; -} - static inline void fixup_cum_window_demand(struct rq *rq, s64 delta) { rq->cum_window_demand += delta; diff --git a/kernel/sched/walt.h b/kernel/sched/walt.h index de7edac43674..34c72a0fcf39 100644 --- a/kernel/sched/walt.h +++ b/kernel/sched/walt.h @@ -54,6 +54,8 @@ static inline void walt_set_window_start(struct rq *rq) { } static inline void walt_migrate_sync_cpu(int cpu) { } static inline void walt_init_cpu_efficiency(void) { } static inline u64 walt_ktime_clock(void) { return 0; } +static inline void walt_account_irqtime(int cpu, struct task_struct *curr, + u64 delta, u64 wallclock) { } #define walt_cpu_high_irqload(cpu) false diff --git a/kernel/sysctl.c b/kernel/sysctl.c index e2aab9cf058b..8980bdffde3d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -575,6 +575,36 @@ static 
struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, +#ifdef CONFIG_SCHED_WALT + { + .procname = "sched_use_walt_cpu_util", + .data = &sysctl_sched_use_walt_cpu_util, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sched_use_walt_task_util", + .data = &sysctl_sched_use_walt_task_util, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sched_walt_init_task_load_pct", + .data = &sysctl_sched_walt_init_task_load_pct, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sched_walt_cpu_high_irqload", + .data = &sysctl_sched_walt_cpu_high_irqload, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif { .procname = "sched_cstate_aware", .data = &sysctl_sched_cstate_aware, -- cgit v1.2.3 From 33d3b17bfdfb68760bc3fdf79748f8b65ce71978 Mon Sep 17 00:00:00 2001 From: Rashed Abdel-Tawab Date: Thu, 18 Jan 2018 12:49:34 -0800 Subject: ARM: dts: msm: Add msm8998 energy model Squash of commits: ed6442938f08: Enable EAS in 8998 MTP 3989a0e22e44: Update Energy Model using Muskie 922c6f4b9e8b: Added idle-cost-data to energy model and fixed busy-cost-data for big cluster cpus Change-Id: I717eb88204f5e28a1afd494dc484895cc749e2fc --- arch/arm/boot/dts/qcom/msm8998.dtsi | 136 ++++++++++++++++++++++++++++++++++++ 1 file changed, 136 insertions(+) diff --git a/arch/arm/boot/dts/qcom/msm8998.dtsi b/arch/arm/boot/dts/qcom/msm8998.dtsi index ae664e48afff..89936736e0ff 100644 --- a/arch/arm/boot/dts/qcom/msm8998.dtsi +++ b/arch/arm/boot/dts/qcom/msm8998.dtsi @@ -49,6 +49,7 @@ qcom,lmh-dcvs = <&lmh_dcvs0>; enable-method = "psci"; efficiency = <1024>; + sched-energy-costs = <&CPU_COST_0 &CLUSTER_COST_0>; next-level-cache = <&L2_0>; qcom,ea = <&ea0>; L2_0: l2-cache { @@ -77,6 +78,7 @@ qcom,lmh-dcvs = <&lmh_dcvs0>; enable-method = "psci"; efficiency = <1024>; + sched-energy-costs = <&CPU_COST_0 &CLUSTER_COST_0>; next-level-cache = <&L2_0>; qcom,ea = <&ea1>; L1_I_1: l1-icache { @@ -100,6 +102,7 @@ qcom,lmh-dcvs = <&lmh_dcvs0>; enable-method = "psci"; efficiency = <1024>; + sched-energy-costs = <&CPU_COST_0 &CLUSTER_COST_0>; next-level-cache = <&L2_0>; qcom,ea = <&ea2>; L1_I_2: l1-icache { @@ -123,6 +126,7 @@ qcom,lmh-dcvs = <&lmh_dcvs0>; enable-method = "psci"; efficiency = <1024>; + sched-energy-costs = <&CPU_COST_0 &CLUSTER_COST_0>; next-level-cache = <&L2_0>; qcom,ea = <&ea3>; L1_I_3: l1-icache { @@ -146,6 +150,7 @@ qcom,lmh-dcvs = <&lmh_dcvs1>; enable-method = "psci"; efficiency = <1536>; + sched-energy-costs = <&CPU_COST_1 &CLUSTER_COST_1>; next-level-cache = <&L2_1>; qcom,ea = <&ea4>; L2_1: l2-cache { @@ -173,6 +178,7 @@ qcom,lmh-dcvs = <&lmh_dcvs1>; enable-method = "psci"; efficiency = <1536>; + sched-energy-costs = <&CPU_COST_1 &CLUSTER_COST_1>; next-level-cache = <&L2_1>; qcom,ea = <&ea5>; L1_I_101: l1-icache { @@ -196,6 +202,7 @@ qcom,lmh-dcvs = <&lmh_dcvs1>; enable-method = "psci"; efficiency = <1536>; + sched-energy-costs = <&CPU_COST_1 &CLUSTER_COST_1>; next-level-cache = <&L2_1>; qcom,ea = <&ea6>; L1_I_102: l1-icache { @@ -219,6 +226,7 @@ qcom,lmh-dcvs = <&lmh_dcvs1>; enable-method = "psci"; efficiency = <1536>; + sched-energy-costs = <&CPU_COST_1 &CLUSTER_COST_1>; next-level-cache = <&L2_1>; qcom,ea = <&ea7>; L1_I_103: l1-icache { @@ -271,6 +279,134 @@ }; }; }; + energy-costs { + CPU_COST_0: core-cost0 { + busy-cost-data = < + 68 11 + 83 20 + 101 25 + 118 30 + 136 33 + 154 
36 + 155 36 + 189 42 + 203 47 + 220 54 + 239 62 + 226 67 + 270 73 + 287 79 + 305 88 + 322 95 + 342 104 + 358 111 + 385 134 + 402 155 + 420 178 + 438 201 + >; + idle-cost-data = < + 4 4 0 + >; + }; + CPU_COST_1: core-cost1 { + busy-cost-data = < + 132 56 + 153 76 + 186 91 + 220 105 + 254 118 + 289 135 + 323 150 + 357 162 + 400 181 + 435 196 + 467 214 + 502 229 + 528 248 + 561 280 + 596 316 + 630 354 + 664 392 + 698 439 + 733 495 + 767 565 + 800 622 + 833 691 + 869 792 + 903 889 + 938 1024 + 1020 1141 + 1024 1138 + >; + idle-cost-data = < + 10 10 0 + >; + }; + CLUSTER_COST_0: cluster-cost0 { + busy-cost-data = < + 68 17 + 83 18 + 101 18 + 118 20 + 136 21 + 154 23 + 155 23 + 189 24 + 203 27 + 220 29 + 239 30 + 226 32 + 270 33 + 287 35 + 305 38 + 322 39 + 342 42 + 358 46 + 385 48 + 402 53 + 420 59 + 438 66 + >; + idle-cost-data = < + 31 31 31 0 + >; + }; + CLUSTER_COST_1: cluster-cost1 { + busy-cost-data = < + 132 24 + 153 25 + 186 26 + 220 29 + 254 30 + 289 33 + 323 35 + 357 37 + 400 38 + 435 40 + 467 43 + 502 44 + 528 46 + 561 50 + 596 54 + 630 60 + 664 63 + 698 70 + 733 74 + 767 80 + 800 87 + 833 96 + 869 104 + 903 120 + 938 130 + 1020 203 + 1024 203 + >; + idle-cost-data = < + 50 50 50 0 + >; + }; + }; }; soc: soc { }; -- cgit v1.2.3 From 83dcbae147826d778c4edd1f3259de7f1c28403d Mon Sep 17 00:00:00 2001 From: Siqi Lin Date: Thu, 27 Apr 2017 09:43:56 -0700 Subject: ARM: dts: msm: Fix EAS idle-cost-data property length We need 4 idle-cost-data for CPUs, despite cpu_idle supporting only 3 different idle states. The idle-cost-data property length should always be one more entry longer than the number of available cpu_idle states. The idle-cost-data property has to have the same length for both CLUSTER_COST_N and CPU_COST_N. Bug: 37641804 Change-Id: Ic14a6a1ef4409e81c5adc23575f7d1157d6eadce Signed-off-by: Siqi Lin --- arch/arm/boot/dts/qcom/msm8998.dtsi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/boot/dts/qcom/msm8998.dtsi b/arch/arm/boot/dts/qcom/msm8998.dtsi index 89936736e0ff..0f8a46a05739 100644 --- a/arch/arm/boot/dts/qcom/msm8998.dtsi +++ b/arch/arm/boot/dts/qcom/msm8998.dtsi @@ -306,7 +306,7 @@ 438 201 >; idle-cost-data = < - 4 4 0 + 4 4 4 0 >; }; CPU_COST_1: core-cost1 { @@ -340,7 +340,7 @@ 1024 1138 >; idle-cost-data = < - 10 10 0 + 10 10 10 0 >; }; CLUSTER_COST_0: cluster-cost0 { -- cgit v1.2.3 From ab88411382f7c022679edcaa1e90b51947f91401 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 18 May 2017 11:38:57 +0100 Subject: ARM: dts: msm: fix EM to be monotonically increasing Change-Id: Iad2e3882a2e9d7dbbfd80cf485bbb1f0e664b04f Signed-off-by: Patrick Bellasi --- arch/arm/boot/dts/qcom/msm8998.dtsi | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/arm/boot/dts/qcom/msm8998.dtsi b/arch/arm/boot/dts/qcom/msm8998.dtsi index 0f8a46a05739..e6c4c6f1864f 100644 --- a/arch/arm/boot/dts/qcom/msm8998.dtsi +++ b/arch/arm/boot/dts/qcom/msm8998.dtsi @@ -287,13 +287,13 @@ 101 25 118 30 136 33 - 154 36 + 154 35 155 36 189 42 203 47 220 54 239 62 - 226 67 + 255 67 270 73 287 79 305 88 @@ -350,13 +350,13 @@ 101 18 118 20 136 21 - 154 23 + 154 22 155 23 189 24 203 27 220 29 239 30 - 226 32 + 255 32 270 33 287 35 305 38 @@ -399,7 +399,7 @@ 869 104 903 120 938 130 - 1020 203 + 1020 200 1024 203 >; idle-cost-data = < -- cgit v1.2.3 From 72f13941085b9b9f3b8cc6f6d84016ca44093265 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 18 May 2017 11:41:03 +0100 Subject: ARM: dts: msm: fix CPU's idle-cost-data CPU idle states are 
mapped into EAS energy model data structures according to this table: + cpu_idle_status | + idle-cost-data index | | + meaning | | | + expected energy cost | | | | -1 0: CPU active CPU energy > 0 0 1: CPU WFI CPU energy > 0 1 2: CPU off (cluster on) CPU energy = 0 2 3: CPU off (cluster off) CPU energy = 0 Change-Id: I4b51bb74cb96c265731f3872c95947474db973ac Signed-off-by: Patrick Bellasi --- arch/arm/boot/dts/qcom/msm8998.dtsi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/boot/dts/qcom/msm8998.dtsi b/arch/arm/boot/dts/qcom/msm8998.dtsi index e6c4c6f1864f..fb601f9a7bbc 100644 --- a/arch/arm/boot/dts/qcom/msm8998.dtsi +++ b/arch/arm/boot/dts/qcom/msm8998.dtsi @@ -306,7 +306,7 @@ 438 201 >; idle-cost-data = < - 4 4 4 0 + 4 4 0 0 >; }; CPU_COST_1: core-cost1 { @@ -340,7 +340,7 @@ 1024 1138 >; idle-cost-data = < - 10 10 10 0 + 10 10 0 0 >; }; CLUSTER_COST_0: cluster-cost0 { -- cgit v1.2.3 From b65c91c9aa14ce6e8d61d488878c0dc734508a76 Mon Sep 17 00:00:00 2001 From: Andres Oportus Date: Fri, 25 Aug 2017 16:46:36 -0700 Subject: ARM: dts: msm: add HW CPU's busy-cost-data for additional freqs Initial Enery Model was calculated with a device including less number of available frequencies. This change adds the missing values, note that all performance values had to be updated so they would be re-normalized to 0-1024. Bug: 64837462 Test: YouTube did not have energy regression Change-Id: I2b4c62d06e39fe0da524af96568187042664d62a Signed-off-by: Andres Oportus --- arch/arm/boot/dts/qcom/msm8998.dtsi | 204 +++++++++++++++++++----------------- 1 file changed, 106 insertions(+), 98 deletions(-) diff --git a/arch/arm/boot/dts/qcom/msm8998.dtsi b/arch/arm/boot/dts/qcom/msm8998.dtsi index fb601f9a7bbc..b9a38ddc5ba8 100644 --- a/arch/arm/boot/dts/qcom/msm8998.dtsi +++ b/arch/arm/boot/dts/qcom/msm8998.dtsi @@ -282,28 +282,28 @@ energy-costs { CPU_COST_0: core-cost0 { busy-cost-data = < - 68 11 - 83 20 - 101 25 - 118 30 - 136 33 - 154 35 - 155 36 - 189 42 - 203 47 - 220 54 - 239 62 - 255 67 - 270 73 - 287 79 - 305 88 - 322 95 - 342 104 - 358 111 - 385 134 - 402 155 - 420 178 - 438 201 + 65 11 + 80 20 + 96 25 + 113 30 + 130 33 + 147 35 + 164 36 + 181 42 + 194 47 + 211 54 + 228 62 + 243 67 + 258 73 + 275 79 + 292 88 + 308 95 + 326 104 + 342 111 + 368 134 + 384 155 + 401 178 + 419 201 >; idle-cost-data = < 4 4 0 0 @@ -311,62 +311,66 @@ }; CPU_COST_1: core-cost1 { busy-cost-data = < - 132 56 - 153 76 - 186 91 - 220 105 - 254 118 - 289 135 - 323 150 - 357 162 - 400 181 - 435 196 - 467 214 - 502 229 - 528 248 - 561 280 - 596 316 - 630 354 - 664 392 - 698 439 - 733 495 - 767 565 - 800 622 - 833 691 - 869 792 - 903 889 - 938 1024 - 1020 1141 - 1024 1138 + 129 56 + 148 76 + 182 91 + 216 105 + 247 118 + 278 135 + 312 150 + 344 162 + 391 181 + 419 196 + 453 214 + 487 229 + 509 248 + 546 280 + 581 316 + 615 354 + 650 392 + 676 439 + 712 495 + 739 565 + 776 622 + 803 691 + 834 792 + 881 889 + 914 1059 + 957 1244 + 975 1375 + 996 1549 + 1016 1617 + 1021 1677 + 1024 1683 >; idle-cost-data = < - 10 10 0 0 + 10 10 0 0 >; }; CLUSTER_COST_0: cluster-cost0 { busy-cost-data = < - 68 17 - 83 18 - 101 18 - 118 20 - 136 21 - 154 22 - 155 23 - 189 24 - 203 27 - 220 29 - 239 30 - 255 32 - 270 33 - 287 35 - 305 38 - 322 39 - 342 42 - 358 46 - 385 48 - 402 53 - 420 59 - 438 66 + 65 17 + 80 18 + 96 18 + 113 20 + 130 21 + 147 22 + 164 23 + 181 24 + 194 27 + 211 29 + 228 30 + 243 32 + 258 33 + 275 35 + 292 38 + 308 39 + 326 42 + 342 46 + 368 48 + 384 53 + 401 59 + 419 66 >; idle-cost-data = < 31 31 31 0 @@ 
-374,32 +378,36 @@ }; CLUSTER_COST_1: cluster-cost1 { busy-cost-data = < - 132 24 - 153 25 - 186 26 - 220 29 - 254 30 - 289 33 - 323 35 - 357 37 - 400 38 - 435 40 - 467 43 - 502 44 - 528 46 - 561 50 - 596 54 - 630 60 - 664 63 - 698 70 - 733 74 - 767 80 - 800 87 - 833 96 - 869 104 - 903 120 - 938 130 - 1020 200 + 129 24 + 148 25 + 182 26 + 216 29 + 247 30 + 278 33 + 312 35 + 344 37 + 391 38 + 419 40 + 453 43 + 487 44 + 509 46 + 546 50 + 581 54 + 615 60 + 650 63 + 676 70 + 712 74 + 739 80 + 776 87 + 803 96 + 834 104 + 881 120 + 914 130 + 957 171 + 975 178 + 996 185 + 1016 200 + 1021 202 1024 203 >; idle-cost-data = < -- cgit v1.2.3 From 9539942cb065e9ec5749a89077b12fc3a0c51b0a Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Thu, 18 May 2017 22:41:33 -0700 Subject: FROMLIST: cpufreq: Make iowait boost a policy option Make iowait boost a cpufreq policy option and enable it for intel_pstate cpufreq driver. Governors like schedutil can use it to determine if boosting for tasks that wake up with p->in_iowait set is needed. Bug: 38010527 Link: https://lkml.org/lkml/2017/5/19/43 Change-Id: Icf59e75fbe731dc67abb28fb837f7bb0cd5ec6cc Signed-off-by: Joel Fernandes --- drivers/cpufreq/intel_pstate.c | 1 + include/linux/cpufreq.h | 3 +++ 2 files changed, 4 insertions(+) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 15fcf2cac971..53226f33ea98 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -1187,6 +1187,7 @@ static int intel_pstate_cpu_init(struct cpufreq_policy *policy) policy->min = cpu->pstate.min_pstate * cpu->pstate.scaling; policy->max = cpu->pstate.turbo_pstate * cpu->pstate.scaling; + policy->iowait_boost_enable = true; /* cpuinfo and default policy values */ policy->cpuinfo.min_freq = cpu->pstate.min_pstate * cpu->pstate.scaling; diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 9302d016b89f..8e9d08dfbd18 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -123,6 +123,9 @@ struct cpufreq_policy { unsigned int up_transition_delay_us; unsigned int down_transition_delay_us; + /* Boost switch for tasks with p->in_iowait set */ + bool iowait_boost_enable; + /* Cached frequency lookup from cpufreq_driver_resolve_freq. */ unsigned int cached_target_freq; int cached_resolved_idx; -- cgit v1.2.3 From ff383d94478af0bb62f828bad550e42681a7176e Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Thu, 18 May 2017 22:46:10 -0700 Subject: FROMLIST: sched: Make iowait_boost optional in schedutil We should apply the iowait boost only if cpufreq policy has iowait boost enabled. Also make it a schedutil configuration from sysfs so it can be turned on/off if needed (by default initialize it to the policy value). 
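The diff above wires a per-policy opt-in flag (policy->iowait_boost_enable) into cpufreq, and the schedutil hunks that follow consume it. Below is a rough, userspace-only C sketch of that gate; the struct and function names are simplified stand-ins for the real policy/sugov code, not the kernel definitions.

#include <stdio.h>
#include <stdbool.h>

#define SCHED_CPUFREQ_IOWAIT 0x01

struct policy   { bool iowait_boost_enable; };
struct tunables { bool iowait_boost_enable; };
struct sg_cpu   { unsigned int iowait_boost, iowait_boost_max; };

static void governor_init(struct tunables *t, const struct policy *p)
{
	/* mirrors: tunables->iowait_boost_enable = policy->iowait_boost_enable */
	t->iowait_boost_enable = p->iowait_boost_enable;
}

static void set_iowait_boost(struct sg_cpu *sg, const struct tunables *t,
			     unsigned int flags)
{
	if (!t->iowait_boost_enable)
		return;				/* driver/policy opted out */
	if (flags & SCHED_CPUFREQ_IOWAIT)
		sg->iowait_boost = sg->iowait_boost_max;
}

int main(void)
{
	struct policy p = { .iowait_boost_enable = true };	/* e.g. an intel_pstate-like driver */
	struct tunables tun;
	struct sg_cpu cpu = { .iowait_boost = 0, .iowait_boost_max = 1024 };

	governor_init(&tun, &p);
	set_iowait_boost(&cpu, &tun, SCHED_CPUFREQ_IOWAIT);
	printf("iowait_boost after IO-wait wakeup: %u\n", cpu.iowait_boost);
	return 0;
}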
For systems that don't need/want it enabled, such as those on arm64 based mobile devices that are battery operated, it saves energy when the cpufreq driver policy doesn't have it enabled (details below): Here are some results for energy measurements collected running a YouTube video for 30 seconds: Before: 8.042533 mWh After: 7.948377 mWh Energy savings is ~1.2% Bug: 38010527 Link: https://lkml.org/lkml/2017/5/19/42 Change-Id: If124076ad0c16ade369253840dedfbf870aff927 Signed-off-by: Joel Fernandes --- kernel/sched/cpufreq_schedutil.c | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 6c84b4d28914..6effb44aeb30 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -33,6 +33,7 @@ struct sugov_tunables { struct gov_attr_set attr_set; unsigned int up_rate_limit_us; unsigned int down_rate_limit_us; + bool iowait_boost_enable; }; struct sugov_policy { @@ -228,6 +229,11 @@ static void sugov_get_util(unsigned long *util, unsigned long *max, u64 time) static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, unsigned int flags) { + struct sugov_policy *sg_policy = sg_cpu->sg_policy; + + if (!sg_policy->tunables->iowait_boost_enable) + return; + if (flags & SCHED_CPUFREQ_IOWAIT) { if (sg_cpu->iowait_boost_pending) return; @@ -510,12 +516,36 @@ static ssize_t down_rate_limit_us_store(struct gov_attr_set *attr_set, return count; } +static ssize_t iowait_boost_enable_show(struct gov_attr_set *attr_set, + char *buf) +{ + struct sugov_tunables *tunables = to_sugov_tunables(attr_set); + + return sprintf(buf, "%u\n", tunables->iowait_boost_enable); +} + +static ssize_t iowait_boost_enable_store(struct gov_attr_set *attr_set, + const char *buf, size_t count) +{ + struct sugov_tunables *tunables = to_sugov_tunables(attr_set); + bool enable; + + if (kstrtobool(buf, &enable)) + return -EINVAL; + + tunables->iowait_boost_enable = enable; + + return count; +} + static struct governor_attr up_rate_limit_us = __ATTR_RW(up_rate_limit_us); static struct governor_attr down_rate_limit_us = __ATTR_RW(down_rate_limit_us); +static struct governor_attr iowait_boost_enable = __ATTR_RW(iowait_boost_enable); static struct attribute *sugov_attributes[] = { &up_rate_limit_us.attr, &down_rate_limit_us.attr, + &iowait_boost_enable.attr, NULL }; @@ -675,6 +705,8 @@ static int sugov_init(struct cpufreq_policy *policy) } } + tunables->iowait_boost_enable = policy->iowait_boost_enable; + policy->governor_data = sg_policy; sg_policy->tunables = tunables; -- cgit v1.2.3 From ebdb82f7b34aeab34623d7a5e4dd673fc2807842 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Thu, 20 Jul 2017 23:46:56 -0700 Subject: sched/fair: Skip frequency updates if CPU about to idle If CPU is about to idle, prevent a frequency update. With the number of schedutil governor wake ups are reduced by more than half on a test playing bluetooth audio. 
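With the iowait_boost_enable governor attribute added in the schedutil patch above, toggling the boost from userspace is a plain sysfs write. The helper below is a hedged sketch: the exact attribute path is an assumption (it depends on the kernel's cpufreq sysfs layout and on whether per-policy tunables are in use), so adjust it for the target device.

#include <stdio.h>

int main(int argc, char **argv)
{
	/* Assumed path -- verify on the target; may also live under
	 * /sys/devices/system/cpu/cpufreq/schedutil/ on shared-tunable setups. */
	const char *path =
		"/sys/devices/system/cpu/cpufreq/policy0/schedutil/iowait_boost_enable";
	const char *value = (argc > 1) ? argv[1] : "0";	/* "0" or "1" */
	FILE *f = fopen(path, "w");

	if (!f) {
		perror("fopen");
		return 1;
	}
	fprintf(f, "%s\n", value);
	fclose(f);
	return 0;
}

Run as root with "1" or "0" as the argument; it is equivalent to echoing the value into the file from a shell.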
Test: sugov wake ups drop by more than half when playing music with screen off (476 / 1092) Bug: 64689959 Change-Id: I400026557b4134c0ac77f51c79610a96eb985b4a Signed-off-by: Joel Fernandes --- kernel/sched/fair.c | 18 +++++++++++++++--- kernel/sched/sched.h | 1 + 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 08e608a04f5b..ee5f8e686a31 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4332,6 +4332,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) */ #define UPDATE_TG 0x1 #define SKIP_AGE_LOAD 0x2 +#define SKIP_CPUFREQ 0x4 /* Update task and its cfs_rq load average */ static inline void update_load_avg(struct sched_entity *se, int flags) @@ -4352,7 +4353,7 @@ static inline void update_load_avg(struct sched_entity *se, int flags) cfs_rq->curr == se, NULL); } - decayed = update_cfs_rq_load_avg(now, cfs_rq, true); + decayed = update_cfs_rq_load_avg(now, cfs_rq, !(flags & SKIP_CPUFREQ)); decayed |= propagate_entity_load_avg(se); if (decayed && (flags & UPDATE_TG)) @@ -4528,6 +4529,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) #define UPDATE_TG 0x0 #define SKIP_AGE_LOAD 0x0 +#define SKIP_CPUFREQ 0x3 static inline void update_load_avg(struct sched_entity *se, int not_used1){} static inline void @@ -4750,6 +4752,8 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); static void dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { + int update_flags; + /* * Update run-time statistics of the 'current'. */ @@ -4763,7 +4767,12 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * - For group entity, update its weight to reflect the new share * of its group cfs_rq. */ - update_load_avg(se, UPDATE_TG); + update_flags = UPDATE_TG; + + if (flags & DEQUEUE_IDLE) + update_flags |= SKIP_CPUFREQ; + + update_load_avg(se, update_flags); dequeue_entity_load_avg(cfs_rq, se); update_stats_dequeue(cfs_rq, se); @@ -6038,6 +6047,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct sched_entity *se = &p->se; int task_sleep = flags & DEQUEUE_SLEEP; + if (task_sleep && rq->nr_running == 1) + flags |= DEQUEUE_IDLE; + for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); dequeue_entity(cfs_rq, se, flags); @@ -6078,7 +6090,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; - update_load_avg(se, UPDATE_TG); + update_load_avg(se, UPDATE_TG | (flags & DEQUEUE_IDLE)); update_cfs_shares(se); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 284cc86d3ad4..bafa2931c898 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2032,6 +2032,7 @@ static const u32 prio_to_wmult[40] = { #define DEQUEUE_SLEEP 0x01 #define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */ #define DEQUEUE_MOVE 0x04 /* matches ENQUEUE_MOVE */ +#define DEQUEUE_IDLE 0x80 /* The last dequeue before IDLE */ #define ENQUEUE_WAKEUP 0x01 #define ENQUEUE_RESTORE 0x02 -- cgit v1.2.3 From b775cb29f66382f04ba4c1e7ad385081a020269b Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Tue, 27 Feb 2018 15:29:09 +0000 Subject: ANDROID: Move schedtune en/dequeue before schedutil update triggers CPU rq util updates happen when rq signals are updated as part of enqueue and dequeue operations. Doing these updates triggers a call to the registered util update handler, which takes schedtune boosting into account. 
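The SKIP_CPUFREQ/DEQUEUE_IDLE plumbing in the hunks above can be hard to follow in diff form. The standalone C sketch below models only the flag flow; it is a simplification (the real update_load_avg() does far more) meant to show when the governor update is skipped.

#include <stdio.h>
#include <stdbool.h>

#define UPDATE_TG     0x1
#define SKIP_CPUFREQ  0x4
#define DEQUEUE_SLEEP 0x01
#define DEQUEUE_IDLE  0x80	/* the last dequeue before idle */

static void update_load_avg(int flags)
{
	bool update_freq = !(flags & SKIP_CPUFREQ);
	printf("load updated, cpufreq update %s\n",
	       update_freq ? "requested" : "skipped (CPU about to idle)");
}

static void dequeue_task(int nr_running, int flags)
{
	int update_flags = UPDATE_TG;

	if ((flags & DEQUEUE_SLEEP) && nr_running == 1)
		flags |= DEQUEUE_IDLE;
	if (flags & DEQUEUE_IDLE)
		update_flags |= SKIP_CPUFREQ;

	update_load_avg(update_flags);
}

int main(void)
{
	dequeue_task(3, DEQUEUE_SLEEP);	/* other tasks remain: update freq   */
	dequeue_task(1, DEQUEUE_SLEEP);	/* last task sleeps: skip the update */
	return 0;
}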
Enqueueing the task in the correct schedtune group after this happens means that we will potentially not see the boost for an entire throttle period. Move the enqueue/dequeue operations for schedtune before the signal updates which can trigger OPP changes. Change-Id: I4236e6b194bc5daad32ff33067d4be1987996780 Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 63 +++++++++++++++++++++++++---------------------------- kernel/sched/rt.c | 8 +++++++ 2 files changed, 38 insertions(+), 33 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ee5f8e686a31..ef6046d3a016 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5951,6 +5951,25 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct sched_entity *se = &p->se; #ifdef CONFIG_SMP int task_new = flags & ENQUEUE_WAKEUP_NEW; + + /* + * Update SchedTune accounting. + * + * We do it before updating the CPU capacity to ensure the + * boost value of the current task is accounted for in the + * selection of the OPP. + * + * We do it also in the case where we enqueue a throttled task; + * we could argue that a throttled task should not boost a CPU, + * however: + * a) properly implementing CPU boosting considering throttled + * tasks will increase a lot the complexity of the solution + * b) it's not easy to quantify the benefits introduced by + * such a more complex solution. + * Thus, for the time being we go for the simple solution and boost + * also for throttled RQs. + */ + schedtune_enqueue_task(p, cpu_of(rq)); #endif /* @@ -6001,26 +6020,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) } #ifdef CONFIG_SMP - - /* - * Update SchedTune accounting. - * - * We do it before updating the CPU capacity to ensure the - * boost value of the current task is accounted for in the - * selection of the OPP. - * - * We do it also in the case where we enqueue a throttled task; - * we could argue that a throttled task should not boost a CPU, - * however: - * a) properly implementing CPU boosting considering throttled - * tasks will increase a lot the complexity of the solution - * b) it's not easy to quantify the benefits introduced by - * such a more complex solution. - * Thus, for the time being we go for the simple solution and boost - * also for throttled RQs. - */ - schedtune_enqueue_task(p, cpu_of(rq)); - if (energy_aware() && !se) { walt_inc_cumulative_runnable_avg(rq, p); if (!task_new && !rq->rd->overutilized && @@ -6050,6 +6049,17 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (task_sleep && rq->nr_running == 1) flags |= DEQUEUE_IDLE; +#ifdef CONFIG_SMP + /* + * Update SchedTune accounting + * + * We do it before updating the CPU capacity to ensure the + * boost value of the current task is accounted for in the + * selection of the OPP. + */ + schedtune_dequeue_task(p, cpu_of(rq)); +#endif + for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); dequeue_entity(cfs_rq, se, flags); @@ -6099,19 +6109,6 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) dec_rq_hmp_stats(rq, p, 1); } -#ifdef CONFIG_SMP - - /* - * Update SchedTune accounting - * - * We do it before updating the CPU capacity to ensure the - * boost value of the current task is accounted for in the - * selection of the OPP. 
- */ - schedtune_dequeue_task(p, cpu_of(rq)); - -#endif /* CONFIG_SMP */ - hrtick_update(rq); } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 2083a54cdd49..ac81704e14d9 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1446,6 +1446,10 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) { struct sched_rt_entity *rt_se = &p->rt; +#ifdef CONFIG_SMP + schedtune_enqueue_task(p, cpu_of(rq)); +#endif + if (flags & ENQUEUE_WAKEUP) rt_se->timeout = 0; @@ -1488,6 +1492,10 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) { struct sched_rt_entity *rt_se = &p->rt; +#ifdef CONFIG_SMP + schedtune_dequeue_task(p, cpu_of(rq)); +#endif + update_curr_rt(rq); dequeue_rt_entity(rt_se, flags); walt_dec_cumulative_runnable_avg(rq, p); -- cgit v1.2.3 From 891a63210e1d65ed226050ce7921dcec210a671a Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Fri, 18 Aug 2017 13:29:46 -0700 Subject: sched/fair: Fix issue where frequency update not skipped This patch fixes one of the infrequent conditions in commit 54b6baeca500 ("sched/fair: Skip frequency updates if CPU about to idle") where we could have skipped a frequency update. The fix is to use the correct flag which skips freq updates. Note that this is a rare issue (can show up only during CFS throttling) and even then we just do an additional frequency update which we were doing anyway before the above patch. Bug: 64689959 Change-Id: I0117442f395cea932ad56617065151bdeb9a3b53 Signed-off-by: Joel Fernandes --- kernel/sched/fair.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ef6046d3a016..2015cf048a44 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4529,7 +4529,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) #define UPDATE_TG 0x0 #define SKIP_AGE_LOAD 0x0 -#define SKIP_CPUFREQ 0x3 +#define SKIP_CPUFREQ 0x0 static inline void update_load_avg(struct sched_entity *se, int not_used1){} static inline void @@ -6092,6 +6092,8 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) } for_each_sched_entity(se) { + int update_flags; + cfs_rq = cfs_rq_of(se); cfs_rq->h_nr_running--; walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p); @@ -6100,7 +6102,12 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; - update_load_avg(se, UPDATE_TG | (flags & DEQUEUE_IDLE)); + update_flags = UPDATE_TG; + + if (flags & DEQUEUE_IDLE) + update_flags |= SKIP_CPUFREQ; + + update_load_avg(se, update_flags); update_cfs_shares(se); } -- cgit v1.2.3 From 629bfed360f99e297f7d8042955710aadbde2123 Mon Sep 17 00:00:00 2001 From: Raghavendra Kakarla Date: Fri, 1 Jun 2018 19:06:53 +0530 Subject: kernel: power: qos: remove check for core isolation while cluster LPMs Since all cores in a cluster are in isolation, PMQoS latency constraint set by clock driver to switch PLL is ignored. So, Cluster enter to L2PC and SPM is trying to disable the PLL and at same time clock driver trying to switch the PLL from other cluster which leads to the synchronization issues. Fix is although all cores are in isolation, honor PMQoS request for cluster LPMs. 
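The qos.c hunk that follows removes the cpu_isolated() skip from pm_qos_request_for_cpumask(). The toy C model below shows the aggregation that loop performs and why dropping isolated CPUs could hide a tight latency constraint; the cpumask is a plain bitmask and the values are made up for illustration.

#include <stdio.h>

enum pm_qos_type { PM_QOS_MIN, PM_QOS_MAX };

#define NR_CPUS 4

static int per_cpu_latency[NR_CPUS] = { 400, 400, 2, 400 };	/* usec; cpu2 holds a tight request */

static int pm_qos_request_for_cpumask(enum pm_qos_type type,
				      unsigned int mask, int default_value)
{
	int val = default_value;

	for (int cpu = 0; cpu < NR_CPUS; cpu++) {
		if (!(mask & (1u << cpu)))
			continue;
		/* Note: no cpu_isolated() skip any more. */
		if (type == PM_QOS_MIN && per_cpu_latency[cpu] < val)
			val = per_cpu_latency[cpu];
		else if (type == PM_QOS_MAX && per_cpu_latency[cpu] > val)
			val = per_cpu_latency[cpu];
	}
	return val;
}

int main(void)
{
	/* Cluster made of cpu2 and cpu3; cpu2 may be isolated but still counts. */
	printf("effective latency constraint: %d us\n",
	       pm_qos_request_for_cpumask(PM_QOS_MIN, 0xC, 400));
	return 0;
}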
Change-Id: I4296e16ef4e9046d1fbe3b7378e9f61a2f11c74d Signed-off-by: Raghavendra Kakarla --- kernel/power/qos.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 49dc710d4a3a..3e3ae5ed8100 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -477,8 +477,6 @@ int pm_qos_request_for_cpumask(int pm_qos_class, struct cpumask *mask) val = c->default_value; for_each_cpu(cpu, mask) { - if (cpu_isolated(cpu)) - continue; switch (c->type) { case PM_QOS_MIN: -- cgit v1.2.3 From 82d3f23d6dc53c564c1c8550f9ee6ac72f85c004 Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Wed, 20 Jun 2018 18:18:33 +0800 Subject: sched/fair: Fix bandwidth timer clock drift condition commit 512ac999d2755d2b7109e996a76b6fb8b888631d upstream. I noticed that cgroup task groups constantly get throttled even if they have low CPU usage, this causes some jitters on the response time to some of our business containers when enabling CPU quotas. It's very simple to reproduce: mkdir /sys/fs/cgroup/cpu/test cd /sys/fs/cgroup/cpu/test echo 100000 > cpu.cfs_quota_us echo $$ > tasks then repeat: cat cpu.stat | grep nr_throttled # nr_throttled will increase steadily After some analysis, we found that cfs_rq::runtime_remaining will be cleared by expire_cfs_rq_runtime() due to two equal but stale "cfs_{b|q}->runtime_expires" after period timer is re-armed. The current condition to judge clock drift in expire_cfs_rq_runtime() is wrong, the two runtime_expires are actually the same when clock drift happens, so this condtion can never hit. The orginal design was correctly done by this commit: a9cf55b28610 ("sched: Expire invalid runtime") ... but was changed to be the current implementation due to its locking bug. This patch introduces another way, it adds a new field in both structures cfs_rq and cfs_bandwidth to record the expiration update sequence, and uses them to figure out if clock drift happens (true if they are equal). Change-Id: I8168fe3b45785643536f289ea823d1a62d9d8ab2 Signed-off-by: Xunlei Pang Signed-off-by: Peter Zijlstra (Intel) [alakeshh: backport: Fixed merge conflicts: - sched.h: Fix the indentation and order in which the variables are declared to match with coding style of the existing code in 4.14 Struct members of same type were declared in separate lines in upstream patch which has been changed back to having multiple members of same type in the same line. e.g. 
int a; int b; -> int a, b; ] Signed-off-by: Alakesh Haloi Reviewed-by: Ben Segall Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: # 4.14.x Fixes: 51f2176d74ac ("sched/fair: Fix unlocked reads of some cfs_b->quota/period") Link: http://lkml.kernel.org/r/20180620101834.24455-1-xlpang@linux.alibaba.com Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- kernel/sched/fair.c | 14 ++++++++------ kernel/sched/sched.h | 4 +++- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2015cf048a44..63f9ad66cf11 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5068,6 +5068,7 @@ void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) now = sched_clock_cpu(smp_processor_id()); cfs_b->runtime = cfs_b->quota; cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period); + cfs_b->expires_seq++; } static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) @@ -5090,6 +5091,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) struct task_group *tg = cfs_rq->tg; struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); u64 amount = 0, min_amount, expires; + int expires_seq; /* note: this is a positive sum as runtime_remaining <= 0 */ min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining; @@ -5106,6 +5108,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) cfs_b->idle = 0; } } + expires_seq = cfs_b->expires_seq; expires = cfs_b->runtime_expires; raw_spin_unlock(&cfs_b->lock); @@ -5115,8 +5118,10 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) * spread between our sched_clock and the one on which runtime was * issued. */ - if ((s64)(expires - cfs_rq->runtime_expires) > 0) + if (cfs_rq->expires_seq != expires_seq) { + cfs_rq->expires_seq = expires_seq; cfs_rq->runtime_expires = expires; + } return cfs_rq->runtime_remaining > 0; } @@ -5142,12 +5147,9 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq) * has not truly expired. * * Fortunately we can check determine whether this the case by checking - * whether the global deadline has advanced. It is valid to compare - * cfs_b->runtime_expires without any locks since we only care about - * exact equality, so a partial write will still work. + * whether the global deadline(cfs_b->expires_seq) has advanced. */ - - if (cfs_rq->runtime_expires != cfs_b->runtime_expires) { + if (cfs_rq->expires_seq == cfs_b->expires_seq) { /* extend local deadline, drift is bounded above by 2 ticks */ cfs_rq->runtime_expires += TICK_NSEC; } else { diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index bafa2931c898..eaf5d3af2e92 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -227,8 +227,9 @@ struct cfs_bandwidth { u64 quota, runtime; s64 hierarchical_quota; u64 runtime_expires; + int expires_seq; - int idle, period_active; + short idle, period_active; struct hrtimer period_timer, slack_timer; struct list_head throttled_cfs_rq; @@ -522,6 +523,7 @@ struct cfs_rq { #endif int runtime_enabled; + int expires_seq; u64 runtime_expires; s64 runtime_remaining; -- cgit v1.2.3 From b933e4d37bc023d27c7394626669bae0a201da52 Mon Sep 17 00:00:00 2001 From: Dave Chiluk Date: Tue, 23 Jul 2019 11:44:26 -0500 Subject: sched/fair: Fix low cpu usage with high throttling by removing expiration of cpu-local slices commit de53fd7aedb100f03e5d2231cfce0e4993282425 upstream. 
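As a rough illustration of the sequence-counter handshake this backport adds, the standalone C sketch below models a global pool that bumps expires_seq on every refill and a local runqueue that treats a matching sequence as clock drift and a differing one as a genuinely new period. The struct layouts are simplified stand-ins, not the kernel definitions.

#include <stdio.h>

struct cfs_bandwidth { unsigned long long runtime; int expires_seq; };
struct cfs_rq        { long long runtime_remaining; int expires_seq; };

static void refill(struct cfs_bandwidth *b, unsigned long long quota)
{
	b->runtime = quota;
	b->expires_seq++;			/* new period */
}

static void expire_local_runtime(struct cfs_rq *rq, const struct cfs_bandwidth *b)
{
	if (rq->runtime_remaining < 0)
		return;
	if (rq->expires_seq == b->expires_seq)
		printf("same period: treat as clock drift, extend local deadline\n");
	else {
		rq->runtime_remaining = 0;	/* global deadline moved on */
		printf("new period: local runtime expired\n");
	}
}

int main(void)
{
	struct cfs_bandwidth b = { 0, 0 };
	struct cfs_rq rq = { 3000000, 0 };	/* 3ms left locally */

	refill(&b, 10000000ULL);		/* period N */
	rq.expires_seq = b.expires_seq;		/* runtime assigned in period N */
	expire_local_runtime(&rq, &b);		/* drift case */

	refill(&b, 10000000ULL);		/* period N+1 */
	expire_local_runtime(&rq, &b);		/* truly expired */
	return 0;
}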
It has been observed, that highly-threaded, non-cpu-bound applications running under cpu.cfs_quota_us constraints can hit a high percentage of periods throttled while simultaneously not consuming the allocated amount of quota. This use case is typical of user-interactive non-cpu bound applications, such as those running in kubernetes or mesos when run on multiple cpu cores. This has been root caused to cpu-local run queue being allocated per cpu bandwidth slices, and then not fully using that slice within the period. At which point the slice and quota expires. This expiration of unused slice results in applications not being able to utilize the quota for which they are allocated. The non-expiration of per-cpu slices was recently fixed by 'commit 512ac999d275 ("sched/fair: Fix bandwidth timer clock drift condition")'. Prior to that it appears that this had been broken since at least 'commit 51f2176d74ac ("sched/fair: Fix unlocked reads of some cfs_b->quota/period")' which was introduced in v3.16-rc1 in 2014. That added the following conditional which resulted in slices never being expired. if (cfs_rq->runtime_expires != cfs_b->runtime_expires) { /* extend local deadline, drift is bounded above by 2 ticks */ cfs_rq->runtime_expires += TICK_NSEC; Because this was broken for nearly 5 years, and has recently been fixed and is now being noticed by many users running kubernetes (https://github.com/kubernetes/kubernetes/issues/67577) it is my opinion that the mechanisms around expiring runtime should be removed altogether. This allows quota already allocated to per-cpu run-queues to live longer than the period boundary. This allows threads on runqueues that do not use much CPU to continue to use their remaining slice over a longer period of time than cpu.cfs_period_us. However, this helps prevent the above condition of hitting throttling while also not fully utilizing your cpu quota. This theoretically allows a machine to use slightly more than its allotted quota in some periods. This overflow would be bounded by the remaining quota left on each per-cpu runqueueu. This is typically no more than min_cfs_rq_runtime=1ms per cpu. For CPU bound tasks this will change nothing, as they should theoretically fully utilize all of their quota in each period. For user-interactive tasks as described above this provides a much better user/application experience as their cpu utilization will more closely match the amount they requested when they hit throttling. This means that cpu limits no longer strictly apply per period for non-cpu bound applications, but that they are still accurate over longer timeframes. This greatly improves performance of high-thread-count, non-cpu bound applications with low cfs_quota_us allocation on high-core-count machines. In the case of an artificial testcase (10ms/100ms of quota on 80 CPU machine), this commit resulted in almost 30x performance improvement, while still maintaining correct cpu quota restrictions. That testcase is available at https://github.com/indeedeng/fibtest. 
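A toy, userspace-only model of the behaviour described above (and implemented by the diff below): per-CPU runtime is handed out in slices from a global pool, and local leftovers now survive the period refill instead of being expired. Quota, slice, and usage numbers are illustrative only.

#include <stdio.h>

#define NSEC_PER_MSEC 1000000ULL
static unsigned long long quota = 10 * NSEC_PER_MSEC;	/* cfs_quota per period */
static unsigned long long slice = 5 * NSEC_PER_MSEC;	/* sched_cfs_bandwidth_slice */
static unsigned long long global_runtime;		/* cfs_b->runtime */

/* Refill at each period edge: only the global pool is reset;
 * per-CPU leftovers are deliberately left alone (the point of the patch). */
static void refill_period(void)
{
	global_runtime = quota;
}

/* Hand at most one slice from the global pool to a local runqueue. */
static void assign_runtime(long long *local_remaining)
{
	unsigned long long amount = slice < global_runtime ? slice : global_runtime;

	global_runtime -= amount;
	*local_remaining += (long long)amount;
}

int main(void)
{
	long long cpu_local = 0;		/* cfs_rq->runtime_remaining */

	refill_period();
	assign_runtime(&cpu_local);
	cpu_local -= 4 * NSEC_PER_MSEC;		/* the CPU used only 4ms of its 5ms slice */

	/* Next period: the unused 1ms survives instead of being expired. */
	refill_period();
	printf("carried over %lld us of local runtime into the new period\n",
	       cpu_local / 1000);
	return 0;
}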
Fixes: 512ac999d275 ("sched/fair: Fix bandwidth timer clock drift condition") Change-Id: I7d7a39fb554ec0c31f9381f492165f43c70b3924 Signed-off-by: Dave Chiluk Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Phil Auld Reviewed-by: Ben Segall Cc: Ingo Molnar Cc: John Hammond Cc: Jonathan Corbet Cc: Kyle Anderson Cc: Gabriel Munos Cc: Peter Oskolkov Cc: Cong Wang Cc: Brendan Gregg Link: https://lkml.kernel.org/r/1563900266-19734-2-git-send-email-chiluk+linux@indeed.com Signed-off-by: Greg Kroah-Hartman --- Documentation/scheduler/sched-bwc.txt | 45 ++++++++++++++++++++++ kernel/sched/fair.c | 70 ++++------------------------------- kernel/sched/sched.h | 4 -- 3 files changed, 52 insertions(+), 67 deletions(-) diff --git a/Documentation/scheduler/sched-bwc.txt b/Documentation/scheduler/sched-bwc.txt index f6b1873f68ab..de583fbbfe42 100644 --- a/Documentation/scheduler/sched-bwc.txt +++ b/Documentation/scheduler/sched-bwc.txt @@ -90,6 +90,51 @@ There are two ways in which a group may become throttled: In case b) above, even though the child may have runtime remaining it will not be allowed to until the parent's runtime is refreshed. +CFS Bandwidth Quota Caveats +--------------------------- +Once a slice is assigned to a cpu it does not expire. However all but 1ms of +the slice may be returned to the global pool if all threads on that cpu become +unrunnable. This is configured at compile time by the min_cfs_rq_runtime +variable. This is a performance tweak that helps prevent added contention on +the global lock. + +The fact that cpu-local slices do not expire results in some interesting corner +cases that should be understood. + +For cgroup cpu constrained applications that are cpu limited this is a +relatively moot point because they will naturally consume the entirety of their +quota as well as the entirety of each cpu-local slice in each period. As a +result it is expected that nr_periods roughly equal nr_throttled, and that +cpuacct.usage will increase roughly equal to cfs_quota_us in each period. + +For highly-threaded, non-cpu bound applications this non-expiration nuance +allows applications to briefly burst past their quota limits by the amount of +unused slice on each cpu that the task group is running on (typically at most +1ms per cpu or as defined by min_cfs_rq_runtime). This slight burst only +applies if quota had been assigned to a cpu and then not fully used or returned +in previous periods. This burst amount will not be transferred between cores. +As a result, this mechanism still strictly limits the task group to quota +average usage, albeit over a longer time window than a single period. This +also limits the burst ability to no more than 1ms per cpu. This provides +better more predictable user experience for highly threaded applications with +small quota limits on high core count machines. It also eliminates the +propensity to throttle these applications while simultanously using less than +quota amounts of cpu. Another way to say this, is that by allowing the unused +portion of a slice to remain valid across periods we have decreased the +possibility of wastefully expiring quota on cpu-local silos that don't need a +full slice's amount of cpu time. + +The interaction between cpu-bound and non-cpu-bound-interactive applications +should also be considered, especially when single core usage hits 100%. 
If you +gave each of these applications half of a cpu-core and they both got scheduled +on the same CPU it is theoretically possible that the non-cpu bound application +will use up to 1ms additional quota in some periods, thereby preventing the +cpu-bound application from fully using its quota by that same amount. In these +instances it will be up to the CFS algorithm (see sched-design-CFS.rst) to +decide which application is chosen to run, as they will both be runnable and +have remaining quota. This runtime discrepancy will be made up in the following +periods when the interactive application idles. + Examples -------- 1. Limit a group to 1 CPU worth of runtime. diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 63f9ad66cf11..266fc95f6c0f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5067,8 +5067,6 @@ void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) now = sched_clock_cpu(smp_processor_id()); cfs_b->runtime = cfs_b->quota; - cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period); - cfs_b->expires_seq++; } static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) @@ -5090,8 +5088,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) { struct task_group *tg = cfs_rq->tg; struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); - u64 amount = 0, min_amount, expires; - int expires_seq; + u64 amount = 0, min_amount; /* note: this is a positive sum as runtime_remaining <= 0 */ min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining; @@ -5108,61 +5105,17 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) cfs_b->idle = 0; } } - expires_seq = cfs_b->expires_seq; - expires = cfs_b->runtime_expires; raw_spin_unlock(&cfs_b->lock); cfs_rq->runtime_remaining += amount; - /* - * we may have advanced our local expiration to account for allowed - * spread between our sched_clock and the one on which runtime was - * issued. - */ - if (cfs_rq->expires_seq != expires_seq) { - cfs_rq->expires_seq = expires_seq; - cfs_rq->runtime_expires = expires; - } return cfs_rq->runtime_remaining > 0; } -/* - * Note: This depends on the synchronization provided by sched_clock and the - * fact that rq->clock snapshots this value. - */ -static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq) -{ - struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); - - /* if the deadline is ahead of our clock, nothing to do */ - if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0)) - return; - - if (cfs_rq->runtime_remaining < 0) - return; - - /* - * If the local deadline has passed we have to consider the - * possibility that our sched_clock is 'fast' and the global deadline - * has not truly expired. - * - * Fortunately we can check determine whether this the case by checking - * whether the global deadline(cfs_b->expires_seq) has advanced. 
- */ - if (cfs_rq->expires_seq == cfs_b->expires_seq) { - /* extend local deadline, drift is bounded above by 2 ticks */ - cfs_rq->runtime_expires += TICK_NSEC; - } else { - /* global deadline is ahead, expiration has passed */ - cfs_rq->runtime_remaining = 0; - } -} - static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) { /* dock delta_exec before expiring quota (as it could span periods) */ cfs_rq->runtime_remaining -= delta_exec; - expire_cfs_rq_runtime(cfs_rq); if (likely(cfs_rq->runtime_remaining > 0)) return; @@ -5396,8 +5349,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) cpu_temp(cpu_of(rq))); } -static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, - u64 remaining, u64 expires) +static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining) { struct cfs_rq *cfs_rq; u64 runtime; @@ -5418,7 +5370,6 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, remaining -= runtime; cfs_rq->runtime_remaining += runtime; - cfs_rq->runtime_expires = expires; /* we check whether we're throttled above */ if (cfs_rq->runtime_remaining > 0) @@ -5443,7 +5394,7 @@ next: */ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) { - u64 runtime, runtime_expires; + u64 runtime; int throttled; /* no need to continue the timer with no bandwidth constraint */ @@ -5471,8 +5422,6 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) /* account preceding periods in which throttling occurred */ cfs_b->nr_throttled += overrun; - runtime_expires = cfs_b->runtime_expires; - /* * This check is repeated as we are holding onto the new bandwidth while * we unthrottle. This can potentially race with an unthrottled group @@ -5485,8 +5434,7 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) cfs_b->distribute_running = 1; raw_spin_unlock(&cfs_b->lock); /* we can't nest cfs_b->lock while distributing bandwidth */ - runtime = distribute_cfs_runtime(cfs_b, runtime, - runtime_expires); + runtime = distribute_cfs_runtime(cfs_b, runtime); raw_spin_lock(&cfs_b->lock); cfs_b->distribute_running = 0; @@ -5563,8 +5511,7 @@ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq) return; raw_spin_lock(&cfs_b->lock); - if (cfs_b->quota != RUNTIME_INF && - cfs_rq->runtime_expires == cfs_b->runtime_expires) { + if (cfs_b->quota != RUNTIME_INF) { cfs_b->runtime += slack_runtime; /* we are under rq->lock, defer unthrottling using a timer */ @@ -5596,7 +5543,6 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) { u64 runtime = 0, slice = sched_cfs_bandwidth_slice(); - u64 expires; /* confirm we're still not at a refresh boundary */ raw_spin_lock(&cfs_b->lock); @@ -5613,7 +5559,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) runtime = cfs_b->runtime; - expires = cfs_b->runtime_expires; if (runtime) cfs_b->distribute_running = 1; @@ -5622,11 +5567,10 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) if (!runtime) return; - runtime = distribute_cfs_runtime(cfs_b, runtime, expires); + runtime = distribute_cfs_runtime(cfs_b, runtime); raw_spin_lock(&cfs_b->lock); - if (expires == cfs_b->runtime_expires) - cfs_b->runtime -= min(runtime, cfs_b->runtime); + cfs_b->runtime -= min(runtime, cfs_b->runtime); cfs_b->distribute_running = 0; raw_spin_unlock(&cfs_b->lock); } diff --git a/kernel/sched/sched.h 
b/kernel/sched/sched.h index eaf5d3af2e92..4e1afb33166b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -226,8 +226,6 @@ struct cfs_bandwidth { ktime_t period; u64 quota, runtime; s64 hierarchical_quota; - u64 runtime_expires; - int expires_seq; short idle, period_active; struct hrtimer period_timer, slack_timer; @@ -523,8 +521,6 @@ struct cfs_rq { #endif int runtime_enabled; - int expires_seq; - u64 runtime_expires; s64 runtime_remaining; u64 throttled_clock, throttled_clock_task; -- cgit v1.2.3 From 997b726bc092e29a9c6bf5f8925c98defc69a6cd Mon Sep 17 00:00:00 2001 From: Frank Luo Date: Tue, 20 Nov 2018 15:33:34 +0800 Subject: kernel: power: Workaround for sensor ipc message causing high power consume Sync from Qcom's document KBA-180725024109 To avoid the non-wakeup type sensor data break the AP sleep flow, notify sensor subsystem in the first place of pm_suspend . Bug: 118418963 Test: measure power consumption after running test case Change-Id: I2848230d495e30ac462aef148b3f885103d9c24e Signed-off-by: Frank Luo --- drivers/soc/qcom/smp2p_sleepstate.c | 5 ++--- kernel/power/suspend.c | 6 ++++++ 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/soc/qcom/smp2p_sleepstate.c b/drivers/soc/qcom/smp2p_sleepstate.c index 2ef25e48ce50..1f0809b61220 100644 --- a/drivers/soc/qcom/smp2p_sleepstate.c +++ b/drivers/soc/qcom/smp2p_sleepstate.c @@ -20,7 +20,8 @@ #define SET_DELAY (2 * HZ) #define PROC_AWAKE_ID 12 /* 12th bit */ -static int slst_gpio_base_id; +int slst_gpio_base_id; + /** * sleepstate_pm_notifier() - PM notifier callback function. @@ -36,13 +37,11 @@ static int sleepstate_pm_notifier(struct notifier_block *nb, { switch (event) { case PM_SUSPEND_PREPARE: - gpio_set_value(slst_gpio_base_id + PROC_AWAKE_ID, 0); msleep(25); /* To be tuned based on SMP2P latencies */ msm_ipc_router_set_ws_allowed(true); break; case PM_POST_SUSPEND: - gpio_set_value(slst_gpio_base_id + PROC_AWAKE_ID, 1); msleep(25); /* To be tuned based on SMP2P latencies */ msm_ipc_router_set_ws_allowed(false); break; diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 6e7832ee6d74..18c322fe8b73 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -34,6 +34,10 @@ #include "power.h" +#include +extern int slst_gpio_base_id; +#define PROC_AWAKE_ID 12 /* 12th bit */ + const char *pm_labels[] = { "mem", "standby", "freeze", NULL }; const char *pm_states[PM_SUSPEND_MAX]; @@ -563,7 +567,9 @@ int pm_suspend(suspend_state_t state) return -EINVAL; pm_suspend_marker("entry"); + gpio_set_value(slst_gpio_base_id + PROC_AWAKE_ID, 0); error = enter_state(state); + gpio_set_value(slst_gpio_base_id + PROC_AWAKE_ID, 1); if (error) { suspend_stats.fail++; dpm_save_failed_errno(error); -- cgit v1.2.3 From 217ab2d0ef91df5539f055c07a7890153c5ce9a2 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 24 May 2018 18:49:46 -0400 Subject: rcu: Speed up calling of RCU tasks callbacks Joel Fernandes found that the synchronize_rcu_tasks() was taking a significant amount of time. He demonstrated it with the following test: # cd /sys/kernel/tracing # while [ 1 ]; do x=1; done & # echo '__schedule_bug:traceon' > set_ftrace_filter # time echo '!__schedule_bug:traceon' > set_ftrace_filter; real 0m1.064s user 0m0.000s sys 0m0.004s Where it takes a little over a second to perform the synchronize, because there's a loop that waits 1 second at a time for tasks to get through their quiescent points when there's a task that must be waited for. 
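The diff below replaces the fixed one-second wait with a graduated backoff, starting at HZ/10 and stretching toward a full HZ. The small userspace sketch here just prints that schedule so the shape of the backoff is visible; HZ=100 is an assumption for illustration.

#include <stdio.h>

#define HZ 100	/* assumed; many mobile configs use 100 or 250 */

int main(void)
{
	int fract = 10;
	unsigned long waited = 0;

	for (int iter = 1; iter <= 12; iter++) {
		int sleep_jiffies = HZ / fract;	/* schedule_timeout_interruptible(HZ/fract) */

		waited += sleep_jiffies;
		printf("iteration %2d: sleep %3d jiffies (total %lu)\n",
		       iter, sleep_jiffies, waited);
		if (fract > 1)
			fract--;		/* slowly back off toward one full HZ */
	}
	return 0;
}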
After discussion we came up with a simple way to wait for holdouts but increase the time for each iteration of the loop but no more than a full second. With the new patch we have: # time echo '!__schedule_bug:traceon' > set_ftrace_filter; real 0m0.131s user 0m0.000s sys 0m0.004s Which drops it down to 13% of what the original wait time was. Link: http://lkml.kernel.org/r/20180523063815.198302-2-joel@joelfernandes.org Reported-by: Joel Fernandes (Google) Suggested-by: Joel Fernandes (Google) Change-Id: I40bcecdfdb2a1cdae7195f1d3b107455ea4b26b1 Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Paul E. McKenney --- kernel/rcu/update.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 5f748c5a40f0..90fdf77dab7e 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -650,6 +650,7 @@ static int __noreturn rcu_tasks_kthread(void *arg) struct rcu_head *list; struct rcu_head *next; LIST_HEAD(rcu_tasks_holdouts); + int fract; /* Run on housekeeping CPUs by default. Sysadm can move if desired. */ housekeeping_affine(current); @@ -731,13 +732,25 @@ static int __noreturn rcu_tasks_kthread(void *arg) * holdouts. When the list is empty, we are done. */ lastreport = jiffies; - while (!list_empty(&rcu_tasks_holdouts)) { + + /* Start off with HZ/10 wait and slowly back off to 1 HZ wait*/ + fract = 10; + + for (;;) { bool firstreport; bool needreport; int rtst; struct task_struct *t1; - schedule_timeout_interruptible(HZ); + if (list_empty(&rcu_tasks_holdouts)) + break; + + /* Slowly back off waiting for holdouts */ + schedule_timeout_interruptible(HZ/fract); + + if (fract > 1) + fract--; + rtst = READ_ONCE(rcu_task_stall_timeout); needreport = rtst > 0 && time_after(jiffies, lastreport + rtst); -- cgit v1.2.3 From daaa5da96a74d35c64db2952add990213355ab4a Mon Sep 17 00:00:00 2001 From: Prasad Sodagudi Date: Fri, 15 Mar 2019 11:52:48 -0700 Subject: sched: Take irq_sparse lock during the isolation irq_migrate_all_off_this_cpu() is used to migrate IRQs and this function checks for all active irq in the allocated_irqs mask. irq_migrate_all_off_this_cpu() expects the caller to take irq_sparse lock to avoid race conditions while accessing allocated_irqs mask variable. Prevent a race between irq alloc/free and irq migration by adding irq_sparse lock across CPU isolation. Change-Id: I9edece1ecea45297c8f6529952d88b3133046467 Signed-off-by: Prasad Sodagudi --- kernel/sched/core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 17c13347d703..61a80b81afcf 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6024,7 +6024,9 @@ int sched_isolate_cpu(int cpu) smp_call_function_any(&avail_cpus, hrtimer_quiesce_cpu, &cpu, 1); smp_call_function_any(&avail_cpus, timer_quiesce_cpu, &cpu, 1); + irq_lock_sparse(); stop_cpus(cpumask_of(cpu), do_isolation_work_cpu_stop, 0); + irq_unlock_sparse(); calc_load_migrate(rq); update_max_interval(); -- cgit v1.2.3 From 7d11b1a7a11c598a07687f853ded9eca97d89043 Mon Sep 17 00:00:00 2001 From: Georg Veichtlbauer Date: Wed, 26 Jul 2023 21:00:09 +0200 Subject: Revert "sched: cpufreq: Use sched_clock instead of rq_clock when updating schedutil" That commit should have changed rq_clock to sched_clock, instead of sched_ktime_clock, which kept schedutil from making correct decisions. This reverts commit ef3fb04c7df43dfa1793e33f764a2581cda96310. 
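A toy pthread model of the locking rule behind the sched_isolate_cpu() hunk above: the code that walks the allocated-IRQ table must hold the same lock as the allocator, so the isolation path now brackets the migration with irq_lock_sparse()/irq_unlock_sparse(). The bitmap and printouts are stand-ins, not kernel code.

#include <pthread.h>
#include <stdio.h>
#include <stdbool.h>

#define NR_IRQS 64
static pthread_mutex_t sparse_irq_lock = PTHREAD_MUTEX_INITIALIZER;
static bool allocated_irqs[NR_IRQS];

static void irq_alloc(int irq)
{
	pthread_mutex_lock(&sparse_irq_lock);
	allocated_irqs[irq] = true;
	pthread_mutex_unlock(&sparse_irq_lock);
}

/* Caller must hold sparse_irq_lock, exactly like
 * irq_migrate_all_off_this_cpu() expects in the kernel. */
static void migrate_all_off_this_cpu(void)
{
	for (int irq = 0; irq < NR_IRQS; irq++)
		if (allocated_irqs[irq])
			printf("migrating irq %d away from the isolated CPU\n", irq);
}

static void isolate_cpu(void)
{
	pthread_mutex_lock(&sparse_irq_lock);	/* irq_lock_sparse()   */
	migrate_all_off_this_cpu();		/* stop_cpus() payload */
	pthread_mutex_unlock(&sparse_irq_lock);	/* irq_unlock_sparse() */
}

int main(void)
{
	irq_alloc(3);
	irq_alloc(17);
	isolate_cpu();
	return 0;
}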
Change-Id: Id4118894388c33bf2b2d3d5ee27eb35e82dc4a96 --- kernel/sched/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 4e1afb33166b..78ba150f2016 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2874,7 +2874,7 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) data = rcu_dereference_sched(*per_cpu_ptr(&cpufreq_update_util_data, cpu_of(rq))); if (data) - data->func(data, sched_ktime_clock(), flags); + data->func(data, rq_clock(rq), flags); } static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) -- cgit v1.2.3
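For readers unfamiliar with the hook the final hunk touches, the standalone C sketch below models cpufreq_update_util(): a per-CPU callback pointer that, when registered, is invoked with a timestamp and flags; the revert only changes which clock value is passed in. Types and names are simplified stand-ins for the kernel structures.

#include <stdio.h>

struct update_util_data {
	void (*func)(struct update_util_data *data,
		     unsigned long long time, unsigned int flags);
};

#define NR_CPUS 4
static struct update_util_data *cpufreq_update_util_data[NR_CPUS];

static void cpufreq_update_util(int cpu, unsigned long long now, unsigned int flags)
{
	struct update_util_data *data = cpufreq_update_util_data[cpu];

	if (data)				/* governor registered on this CPU? */
		data->func(data, now, flags);
}

static void sugov_update(struct update_util_data *data,
			 unsigned long long time, unsigned int flags)
{
	printf("governor callback at t=%llu ns, flags=0x%x\n", time, flags);
}

int main(void)
{
	static struct update_util_data sugov = { .func = sugov_update };

	cpufreq_update_util_data[0] = &sugov;
	cpufreq_update_util(0, 123456789ULL, 0);	/* 'now' stands in for rq_clock(rq) */
	return 0;
}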