summaryrefslogtreecommitdiff
path: root/kernel/sched
diff options
context:
space:
mode:
authorRunmin Wang <runminw@codeaurora.org>2016-12-12 15:32:39 -0800
committerRunmin Wang <runminw@codeaurora.org>2016-12-16 13:52:17 -0800
commitefbe378b81e36d9ab6d3a2b3e0e2c3834c6a6528 (patch)
tree87312d7b72f387d44e4804561fc82fddf46fe10d /kernel/sched
parenta80e267a8c0d61790c3d1d5f7181ebd1be39c438 (diff)
parent0a9cbce798bca52bb3476cb55ce33da27e100f3e (diff)
Merge branch 'v4.4-16.09-android-tmp' into lsk-v4.4-16.09-android
* v4.4-16.09-android-tmp: unsafe_[get|put]_user: change interface to use a error target label usercopy: remove page-spanning test for now usercopy: fix overlap check for kernel text mm/slub: support left redzone Linux 4.4.21 lib/mpi: mpi_write_sgl(): fix skipping of leading zero limbs regulator: anatop: allow regulator to be in bypass mode hwrng: exynos - Disable runtime PM on probe failure cpufreq: Fix GOV_LIMITS handling for the userspace governor metag: Fix atomic_*_return inline asm constraints scsi: fix upper bounds check of sense key in scsi_sense_key_string() ALSA: timer: fix NULL pointer dereference on memory allocation failure ALSA: timer: fix division by zero after SNDRV_TIMER_IOCTL_CONTINUE ALSA: timer: fix NULL pointer dereference in read()/ioctl() race ALSA: hda - Enable subwoofer on Dell Inspiron 7559 ALSA: hda - Add headset mic quirk for Dell Inspiron 5468 ALSA: rawmidi: Fix possible deadlock with virmidi registration ALSA: fireworks: accessing to user space outside spinlock ALSA: firewire-tascam: accessing to user space outside spinlock ALSA: usb-audio: Add sample rate inquiry quirk for B850V3 CP2114 crypto: caam - fix IV loading for authenc (giv)decryption uprobes: Fix the memcg accounting x86/apic: Do not init irq remapping if ioapic is disabled vhost/scsi: fix reuse of &vq->iov[out] in response bcache: RESERVE_PRIO is too small by one when prio_buckets() is a power of two. ubifs: Fix assertion in layout_in_gaps() ovl: fix workdir creation ovl: listxattr: use strnlen() ovl: remove posix_acl_default from workdir ovl: don't copy up opaqueness wrappers for ->i_mutex access lustre: remove unused declaration timekeeping: Avoid taking lock in NMI path with CONFIG_DEBUG_TIMEKEEPING timekeeping: Cap array access in timekeeping_debug xfs: fix superblock inprogress check ASoC: atmel_ssc_dai: Don't unconditionally reset SSC on stream startup drm/msm: fix use of copy_from_user() while holding spinlock drm: Reject page_flip for !DRIVER_MODESET drm/radeon: fix radeon_move_blit on 32bit systems s390/sclp_ctl: fix potential information leak with /dev/sclp rds: fix an infoleak in rds_inc_info_copy powerpc/tm: Avoid SLB faults in treclaim/trecheckpoint when RI=0 nvme: Call pci_disable_device on the error path. cgroup: reduce read locked section of cgroup_threadgroup_rwsem during fork block: make sure a big bio is split into at most 256 bvecs block: Fix race triggered by blk_set_queue_dying() ext4: avoid modifying checksum fields directly during checksum verification ext4: avoid deadlock when expanding inode size ext4: properly align shifted xattrs when expanding inodes ext4: fix xattr shifting when expanding inodes part 2 ext4: fix xattr shifting when expanding inodes ext4: validate that metadata blocks do not overlap superblock net: Use ns_capable_noaudit() when determining net sysctl permissions kernel: Add noaudit variant of ns_capable() KEYS: Fix ASN.1 indefinite length object parsing drivers:hv: Lock access to hyperv_mmio resource tree cxlflash: Move to exponential back-off when cmd_room is not available netfilter: x_tables: check for size overflow drm/amdgpu/cz: enable/disable vce dpm even if vce pg is disabled cred: Reject inodes with invalid ids in set_create_file_as() fs: Check for invalid i_uid in may_follow_link() IB/IPoIB: Do not set skb truesize since using one linearskb udp: properly support MSG_PEEK with truncated buffers crypto: nx-842 - Mask XERS0 bit in return value cxlflash: Fix to avoid virtual LUN failover failure cxlflash: Fix to escalate LINK_RESET also on port 1 tipc: fix nl compat regression for link statistics tipc: fix an infoleak in tipc_nl_compat_link_dump netfilter: x_tables: check for size overflow Bluetooth: Add support for Intel Bluetooth device 8265 [8087:0a2b] drm/i915: Check VBT for port presence in addition to the strap on VLV/CHV drm/i915: Only ignore eDP ports that are connected Input: xpad - move pending clear to the correct location net: thunderx: Fix link status reporting x86/hyperv: Avoid reporting bogus NMI status for Gen2 instances crypto: vmx - IV size failing on skcipher API tda10071: Fix dependency to REGMAP_I2C crypto: vmx - Fix ABI detection crypto: vmx - comply with ABIs that specify vrsave as reserved. HID: core: prevent out-of-bound readings lpfc: Fix DMA faults observed upon plugging loopback connector block: fix blk_rq_get_max_sectors for driver private requests irqchip/gicv3-its: numa: Enable workaround for Cavium thunderx erratum 23144 clocksource: Allow unregistering the watchdog btrfs: Continue write in case of can_not_nocow blk-mq: End unstarted requests on dying queue cxlflash: Fix to resolve dead-lock during EEH recovery drm/radeon/mst: fix regression in lane/link handling. ecryptfs: fix handling of directory opening ALSA: hda: add AMD Polaris-10/11 AZ PCI IDs with proper driver caps drm: Balance error path for GEM handle allocation ntp: Fix ADJ_SETOFFSET being used w/ ADJ_NANO time: Verify time values in adjtimex ADJ_SETOFFSET to avoid overflow Input: xpad - correctly handle concurrent LED and FF requests net: thunderx: Fix receive packet stats net: thunderx: Fix for multiqset not configured upon interface toggle perf/x86/cqm: Fix CQM memory leak and notifier leak perf/x86/cqm: Fix CQM handling of grouping events into a cache_group s390/crypto: provide correct file mode at device register. proc: revert /proc/<pid>/maps [stack:TID] annotation intel_idle: Support for Intel Xeon Phi Processor x200 Product Family cxlflash: Fix to avoid unnecessary scan with internal LUNs Drivers: hv: vmbus: don't manipulate with clocksources on crash Drivers: hv: vmbus: avoid scheduling in interrupt context in vmbus_initiate_unload() Drivers: hv: vmbus: avoid infinite loop in init_vp_index() arcmsr: fixes not release allocated resource arcmsr: fixed getting wrong configuration data s390/pci_dma: fix DMA table corruption with > 4 TB main memory net/mlx5e: Don't modify CQ before it was created net/mlx5e: Don't try to modify CQ moderation if it is not supported mmc: sdhci: Do not BUG on invalid vdd UVC: Add support for R200 depth camera sched/numa: Fix use-after-free bug in the task_numa_compare ALSA: hda - add codec support for Kabylake display audio codec drm/i915: Fix hpd live status bits for g4x tipc: fix nullptr crash during subscription cancel arm64: Add workaround for Cavium erratum 27456 net: thunderx: Fix for Qset error due to CQ full drm/radeon: fix dp link rate selection (v2) drm/amdgpu: fix dp link rate selection (v2) qla2xxx: Use ATIO type to send correct tmr response mmc: sdhci: 64-bit DMA actually has 4-byte alignment drm/atomic: Do not unset crtc when an encoder is stolen drm/i915/skl: Add missing SKL ids drm/i915/bxt: update list of PCIIDs hrtimer: Catch illegal clockids i40e/i40evf: Fix RSS rx-flow-hash configuration through ethtool mpt3sas: Fix for Asynchronous completion of timedout IO and task abort of timedout IO. mpt3sas: A correction in unmap_resources net: cavium: liquidio: fix check for in progress flag arm64: KVM: Configure TCR_EL2.PS at runtime irqchip/gic-v3: Make sure read from ICC_IAR1_EL1 is visible on redestributor pwm: lpc32xx: fix and simplify duty cycle and period calculations pwm: lpc32xx: correct number of PWM channels from 2 to 1 pwm: fsl-ftm: Fix clock enable/disable when using PM megaraid_sas: Add an i/o barrier megaraid_sas: Fix SMAP issue megaraid_sas: Do not allow PCI access during OCR s390/cio: update measurement characteristics s390/cio: ensure consistent measurement state s390/cio: fix measurement characteristics memleak qeth: initialize net_device with carrier off lpfc: Fix external loopback failure. lpfc: Fix mbox reuse in PLOGI completion lpfc: Fix RDP Speed reporting. lpfc: Fix crash in fcp command completion path. lpfc: Fix driver crash when module parameter lpfc_fcp_io_channel set to 16 lpfc: Fix RegLogin failed error seen on Lancer FC during port bounce lpfc: Fix the FLOGI discovery logic to comply with T11 standards lpfc: Fix FCF Infinite loop in lpfc_sli4_fcf_rr_next_index_get. cxl: Enable PCI device ID for future IBM CXL adapter cxl: fix build for GCC 4.6.x cxlflash: Enable device id for future IBM CXL adapter cxlflash: Resolve oops in wait_port_offline cxlflash: Fix to resolve cmd leak after host reset cxl: Fix DSI misses when the context owning task exits cxl: Fix possible idr warning when contexts are released Drivers: hv: vmbus: fix rescind-offer handling for device without a driver Drivers: hv: vmbus: serialize process_chn_event() and vmbus_close_internal() Drivers: hv: vss: run only on supported host versions drivers/hv: cleanup synic msrs if vmbus connect failed Drivers: hv: util: catch allocation errors tools: hv: report ENOSPC errors in hv_fcopy_daemon Drivers: hv: utils: run polling callback always in interrupt context Drivers: hv: util: Increase the timeout for util services lightnvm: fix missing grown bad block type lightnvm: fix locking and mempool in rrpc_lun_gc lightnvm: unlock rq and free ppa_list on submission fail lightnvm: add check after mempool allocation lightnvm: fix incorrect nr_free_blocks stat lightnvm: fix bio submission issue cxlflash: a couple off by one bugs fm10k: Cleanup exception handling for mailbox interrupt fm10k: Cleanup MSI-X interrupts in case of failure fm10k: reinitialize queuing scheme after calling init_hw fm10k: always check init_hw for errors fm10k: reset max_queues on init_hw_vf failure fm10k: Fix handling of NAPI budget when multiple queues are enabled per vector fm10k: Correct MTU for jumbo frames fm10k: do not assume VF always has 1 queue clk: xgene: Fix divider with non-zero shift value e1000e: fix division by zero on jumbo MTUs e1000: fix data race between tx_ring->next_to_clean ixgbe: Fix handling of NAPI budget when multiple queues are enabled per vector igb: fix NULL derefs due to skipped SR-IOV enabling igb: use the correct i210 register for EEMNGCTL igb: don't unmap NULL hw_addr i40e: Fix Rx hash reported to the stack by our driver i40e: clean whole mac filter list i40evf: check rings before freeing resources i40e: don't add zero MAC filter i40e: properly delete VF MAC filters i40e: Fix memory leaks, sideband filter programming i40e: fix: do not sleep in netdev_ops i40e/i40evf: Fix RS bit update in Tx path and disable force WB workaround i40evf: handle many MAC filters correctly i40e: Workaround fix for mss < 256 issue UPSTREAM: audit: fix a double fetch in audit_log_single_execve_arg() UPSTREAM: ARM: 8494/1: mm: Enable PXN when running non-LPAE kernel on LPAE processor FIXUP: sched/tune: update accouting before CPU capacity FIXUP: sched/tune: add fixes missing from a previous patch arm: Fix #if/#ifdef typo in topology.c arm: Fix build error "conflicting types for 'scale_cpu_capacity'" sched/walt: use do_div instead of division operator DEBUG: cpufreq: fix cpu_capacity tracing build for non-smp systems sched/walt: include missing header for arm_timer_read_counter() cpufreq: Kconfig: Fixup incorrect selection by CPU_FREQ_DEFAULT_GOV_SCHED sched/fair: Avoid redundant idle_cpu() call in update_sg_lb_stats() FIXUP: sched: scheduler-driven cpu frequency selection sched/rt: Add Kconfig option to enable panicking for RT throttling sched/rt: print RT tasks when RT throttling is activated UPSTREAM: sched: Fix a race between __kthread_bind() and sched_setaffinity() sched/fair: Favor higher cpus only for boosted tasks vmstat: make vmstat_updater deferrable again and shut down on idle sched/fair: call OPP update when going idle after migration sched/cpufreq_sched: fix thermal capping events sched/fair: Picking cpus with low OPPs for tasks that prefer idle CPUs FIXUP: sched/tune: do initialization as a postcore_initicall DEBUG: sched: add tracepoint for RD overutilized sched/tune: Introducing a new schedtune attribute prefer_idle sched: use util instead of capacity to select busy cpu arch_timer: add error handling when the MPM global timer is cleared FIXUP: sched: Fix double-release of spinlock in move_queued_task FIXUP: sched/fair: Fix hang during suspend in sched_group_energy FIXUP: sched: fix SchedFreq integration for both PELT and WALT sched: EAS: Avoid causing spikes to max-freq unnecessarily FIXUP: sched: fix set_cfs_cpu_capacity when WALT is in use sched/walt: Accounting for number of irqs pending on each core sched: Introduce Window Assisted Load Tracking (WALT) sched/tune: fix PB and PC cuts indexes definition sched/fair: optimize idle cpu selection for boosted tasks FIXUP: sched/tune: fix accounting for runnable tasks sched/tune: use a single initialisation function sched/{fair,tune}: simplify fair.c code FIXUP: sched/tune: fix payoff calculation for boost region sched/tune: Add support for negative boost values FIX: sched/tune: move schedtune_nornalize_energy into fair.c FIX: sched/tune: update usage of boosted task utilisation on CPU selection sched/fair: add tunable to set initial task load sched/fair: add tunable to force selection at cpu granularity sched: EAS: take cstate into account when selecting idle core sched/cpufreq_sched: Consolidated update FIXUP: sched: fix build for non-SMP target DEBUG: sched/tune: add tracepoint on P-E space filtering DEBUG: sched/tune: add tracepoint for energy_diff() values DEBUG: sched/tune: add tracepoint for task boost signal arm: topology: Define TC2 energy and provide it to the scheduler CHROMIUM: sched: update the average of nr_running DEBUG: schedtune: add tracepoint for schedtune_tasks_update() values DEBUG: schedtune: add tracepoint for CPU boost signal DEBUG: schedtune: add tracepoint for SchedTune configuration update DEBUG: sched: add energy procfs interface DEBUG: sched,cpufreq: add cpu_capacity change tracepoint DEBUG: sched: add tracepoint for CPU load/util signals DEBUG: sched: add tracepoint for task load/util signals DEBUG: sched: add tracepoint for cpu/freq scale invariance sched/fair: filter energy_diff() based on energy_payoff value sched/tune: add support to compute normalized energy sched/fair: keep track of energy/capacity variations sched/fair: add boosted task utilization sched/{fair,tune}: track RUNNABLE tasks impact on per CPU boost value sched/tune: compute and keep track of per CPU boost value sched/tune: add initial support for CGroups based boosting sched/fair: add boosted CPU usage sched/fair: add function to convert boost value into "margin" sched/tune: add sysctl interface to define a boost value sched/tune: add detailed documentation fixup! sched/fair: jump to max OPP when crossing UP threshold fixup! sched: scheduler-driven cpu frequency selection sched: rt scheduler sets capacity requirement sched: deadline: use deadline bandwidth in scale_rt_capacity sched: remove call of sched_avg_update from sched_rt_avg_update sched/cpufreq_sched: add trace events sched/fair: jump to max OPP when crossing UP threshold sched/fair: cpufreq_sched triggers for load balancing sched/{core,fair}: trigger OPP change request on fork() sched/fair: add triggers for OPP change requests sched: scheduler-driven cpu frequency selection cpufreq: introduce cpufreq_driver_is_slow sched: Consider misfit tasks when load-balancing sched: Add group_misfit_task load-balance type sched: Add per-cpu max capacity to sched_group_capacity sched: Do eas idle balance regardless of the rq avg idle value arm64: Enable max freq invariant scheduler load-tracking and capacity support arm: Enable max freq invariant scheduler load-tracking and capacity support sched: Update max cpu capacity in case of max frequency constraints cpufreq: Max freq invariant scheduler load-tracking and cpu capacity support arm64, topology: Updates to use DT bindings for EAS costing data sched: Support for extracting EAS energy costs from DT Documentation: DT bindings for energy model cost data required by EAS sched: Disable energy-unfriendly nohz kicks sched: Consider a not over-utilized energy-aware system as balanced sched: Energy-aware wake-up task placement sched: Determine the current sched_group idle-state sched, cpuidle: Track cpuidle state index in the scheduler sched: Add over-utilization/tipping point indicator sched: Estimate energy impact of scheduling decisions sched: Extend sched_group_energy to test load-balancing decisions sched: Calculate energy consumption of sched_group sched: Highest energy aware balancing sched_domain level pointer sched: Relocated cpu_util() and change return type sched: Compute cpu capacity available at current frequency arm64: Cpu invariant scheduler load-tracking and capacity support arm: Cpu invariant scheduler load-tracking and capacity support sched: Introduce SD_SHARE_CAP_STATES sched_domain flag sched: Initialize energy data structures sched: Introduce energy data structures sched: Make energy awareness a sched feature sched: Documentation for scheduler energy cost model sched: Prevent unnecessary active balance of single task in sched group sched: Enable idle balance to pull single task towards cpu with higher capacity sched: Consider spare cpu capacity at task wake-up sched: Add cpu capacity awareness to wakeup balancing sched: Store system-wide maximum cpu capacity in root domain arm: Update arch_scale_cpu_capacity() to reflect change to define arm64: Enable frequency invariant scheduler load-tracking support arm: Enable frequency invariant scheduler load-tracking support cpufreq: Frequency invariant scheduler load-tracking support sched/fair: Fix new task's load avg removed from source CPU in wake_up_new_task() FROMLIST: pstore: drop pmsg bounce buffer UPSTREAM: usercopy: remove page-spanning test for now UPSTREAM: usercopy: force check_object_size() inline BACKPORT: usercopy: fold builtin_const check into inline function UPSTREAM: x86/uaccess: force copy_*_user() to be inlined UPSTREAM: HID: core: prevent out-of-bound readings Android: Fix build breakages. UPSTREAM: tty: Prevent ldisc drivers from re-using stale tty fields UPSTREAM: netfilter: nfnetlink: correctly validate length of batch messages cpuset: Make cpusets restore on hotplug UPSTREAM: mm/slub: support left redzone UPSTREAM: Make the hardened user-copy code depend on having a hardened allocator Android: MMC/UFS IO Latency Histograms. UPSTREAM: usercopy: fix overlap check for kernel text UPSTREAM: usercopy: avoid potentially undefined behavior in pointer math UPSTREAM: unsafe_[get|put]_user: change interface to use a error target label BACKPORT: arm64: mm: fix location of _etext BACKPORT: ARM: 8583/1: mm: fix location of _etext BACKPORT: Don't show empty tag stats for unprivileged uids UPSTREAM: tcp: fix use after free in tcp_xmit_retransmit_queue() ANDROID: base-cfg: drop SECCOMP_FILTER config UPSTREAM: [media] xc2028: unlock on error in xc2028_set_config() UPSTREAM: [media] xc2028: avoid use after free ANDROID: base-cfg: enable SECCOMP config ANDROID: rcu_sync: Export rcu_sync_lockdep_assert RFC: FROMLIST: cgroup: reduce read locked section of cgroup_threadgroup_rwsem during fork RFC: FROMLIST: cgroup: avoid synchronize_sched() in __cgroup_procs_write() RFC: FROMLIST: locking/percpu-rwsem: Optimize readers and reduce global impact net: ipv6: Fix ping to link-local addresses. ipv6: fix endianness error in icmpv6_err ANDROID: dm: android-verity: Allow android-verity to be compiled as an independent module backporting: a brief introduce of backported feautures on 4.4 Linux 4.4.20 sysfs: correctly handle read offset on PREALLOC attrs hwmon: (iio_hwmon) fix memory leak in name attribute ALSA: line6: Fix POD sysfs attributes segfault ALSA: line6: Give up on the lock while URBs are released. ALSA: line6: Remove double line6_pcm_release() after failed acquire. ACPI / SRAT: fix SRAT parsing order with both LAPIC and X2APIC present ACPI / sysfs: fix error code in get_status() ACPI / drivers: replace acpi_probe_lock spinlock with mutex ACPI / drivers: fix typo in ACPI_DECLARE_PROBE_ENTRY macro staging: comedi: ni_mio_common: fix wrong insn_write handler staging: comedi: ni_mio_common: fix AO inttrig backwards compatibility staging: comedi: comedi_test: fix timer race conditions staging: comedi: daqboard2000: bug fix board type matching code USB: serial: option: add WeTelecom 0x6802 and 0x6803 products USB: serial: option: add WeTelecom WM-D200 USB: serial: mos7840: fix non-atomic allocation in write path USB: serial: mos7720: fix non-atomic allocation in write path USB: fix typo in wMaxPacketSize validation usb: chipidea: udc: don't touch DP when controller is in host mode USB: avoid left shift by -1 dmaengine: usb-dmac: check CHCR.DE bit in usb_dmac_isr_channel() crypto: qat - fix aes-xts key sizes crypto: nx - off by one bug in nx_of_update_msc() Input: i8042 - set up shared ps2_cmd_mutex for AUX ports Input: i8042 - break load dependency between atkbd/psmouse and i8042 Input: tegra-kbc - fix inverted reset logic btrfs: properly track when rescan worker is running btrfs: waiting on qgroup rescan should not always be interruptible fs/seq_file: fix out-of-bounds read gpio: Fix OF build problem on UM usb: renesas_usbhs: gadget: fix return value check in usbhs_mod_gadget_probe() megaraid_sas: Fix probing cards without io port mpt3sas: Fix resume on WarpDrive flash cards cdc-acm: fix wrong pipe type on rx interrupt xfers i2c: cros-ec-tunnel: Fix usage of cros_ec_cmd_xfer() mfd: cros_ec: Add cros_ec_cmd_xfer_status() helper aacraid: Check size values after double-fetch from user ARC: Elide redundant setup of DMA callbacks ARC: Call trace_hardirqs_on() before enabling irqs ARC: use correct offset in pt_regs for saving/restoring user mode r25 ARC: build: Better way to detect ISA compatible toolchain drm/i915: fix aliasing_ppgtt leak drm/amdgpu: record error code when ring test failed drm/amd/amdgpu: sdma resume fail during S4 on CI drm/amdgpu: skip TV/CV in display parsing drm/amdgpu: avoid a possible array overflow drm/amdgpu: fix amdgpu_move_blit on 32bit systems drm/amdgpu: Change GART offset to 64-bit iio: fix sched WARNING "do not call blocking ops when !TASK_RUNNING" sched/nohz: Fix affine unpinned timers mess sched/cputime: Fix NO_HZ_FULL getrusage() monotonicity regression of: fix reference counting in of_graph_get_endpoint_by_regs arm64: dts: rockchip: add reset saradc node for rk3368 SoCs mac80211: fix purging multicast PS buffer queue s390/dasd: fix hanging device after clear subchannel EDAC: Increment correct counter in edac_inc_ue_error() pinctrl/amd: Remove the default de-bounce time iommu/arm-smmu: Don't BUG() if we find aborting STEs with disable_bypass iommu/arm-smmu: Fix CMDQ error handling iommu/dma: Don't put uninitialised IOVA domains xhci: Make sure xhci handles USB_SPEED_SUPER_PLUS devices. USB: serial: ftdi_sio: add PIDs for Ivium Technologies devices USB: serial: ftdi_sio: add device ID for WICED USB UART dev board USB: serial: option: add support for Telit LE920A4 USB: serial: option: add D-Link DWM-156/A3 USB: serial: fix memleak in driver-registration error path xhci: don't dereference a xhci member after removing xhci usb: xhci: Fix panic if disconnect xhci: always handle "Command Ring Stopped" events usb/gadget: fix gadgetfs aio support. usb: gadget: fsl_qe_udc: off by one in setup_received_handle() USB: validate wMaxPacketValue entries in endpoint descriptors usb: renesas_usbhs: Use dmac only if the pipe type is bulk usb: renesas_usbhs: clear the BRDYSTS in usbhsg_ep_enable() USB: hub: change the locking in hub_activate USB: hub: fix up early-exit pathway in hub_activate usb: hub: Fix unbalanced reference count/memory leak/deadlocks usb: define USB_SPEED_SUPER_PLUS speed for SuperSpeedPlus USB3.1 devices usb: dwc3: gadget: increment request->actual once usb: dwc3: pci: add Intel Kabylake PCI ID usb: misc: usbtest: add fix for driver hang usb: ehci: change order of register cleanup during shutdown crypto: caam - defer aead_set_sh_desc in case of zero authsize crypto: caam - fix echainiv(authenc) encrypt shared descriptor crypto: caam - fix non-hmac hashes genirq/msi: Make sure PCI MSIs are activated early genirq/msi: Remove unused MSI_FLAG_IDENTITY_MAP um: Don't discard .text.exit section ACPI / CPPC: Prevent cpc_desc_ptr points to the invalid data ACPI: CPPC: Return error if _CPC is invalid on a CPU mmc: sdhci-acpi: Reduce Baytrail eMMC/SD/SDIO hangs PCI: Limit config space size for Netronome NFP4000 PCI: Add Netronome NFP4000 PF device ID PCI: Limit config space size for Netronome NFP6000 family PCI: Add Netronome vendor and device IDs PCI: Support PCIe devices with short cfg_size NVMe: Don't unmap controller registers on reset ALSA: hda - Manage power well properly for resume libnvdimm, nd_blk: mask off reserved status bits perf intel-pt: Fix occasional decoding errors when tracing system-wide vfio/pci: Fix NULL pointer oops in error interrupt setup handling virtio: fix memory leak in virtqueue_add() parisc: Fix order of EREFUSED define in errno.h arm64: Define AT_VECTOR_SIZE_ARCH for ARCH_DLINFO ALSA: usb-audio: Add quirk for ELP HD USB Camera ALSA: usb-audio: Add a sample rate quirk for Creative Live! Cam Socialize HD (VF0610) powerpc/eeh: eeh_pci_enable(): fix checking of post-request state SUNRPC: allow for upcalls for same uid but different gss service SUNRPC: Handle EADDRNOTAVAIL on connection failures tools/testing/nvdimm: fix SIGTERM vs hotplug crash uprobes/x86: Fix RIP-relative handling of EVEX-encoded instructions x86/mm: Disable preemption during CR3 read+write hugetlb: fix nr_pmds accounting with shared page tables mm: SLUB hardened usercopy support mm: SLAB hardened usercopy support s390/uaccess: Enable hardened usercopy sparc/uaccess: Enable hardened usercopy powerpc/uaccess: Enable hardened usercopy ia64/uaccess: Enable hardened usercopy arm64/uaccess: Enable hardened usercopy ARM: uaccess: Enable hardened usercopy x86/uaccess: Enable hardened usercopy x86: remove more uaccess_32.h complexity x86: remove pointless uaccess_32.h complexity x86: fix SMAP in 32-bit environments Use the new batched user accesses in generic user string handling Add 'unsafe' user access functions for batched accesses x86: reorganize SMAP handling in user space accesses mm: Hardened usercopy mm: Implement stack frame object validation mm: Add is_migrate_cma_page Linux 4.4.19 Documentation/module-signing.txt: Note need for version info if reusing a key module: Invalidate signatures on force-loaded modules dm flakey: error READ bios during the down_interval rtc: s3c: Add s3c_rtc_{enable/disable}_clk in s3c_rtc_setfreq() lpfc: fix oops in lpfc_sli4_scmd_to_wqidx_distr() from lpfc_send_taskmgmt() ACPI / EC: Work around method reentrancy limit in ACPICA for _Qxx x86/platform/intel_mid_pci: Rework IRQ0 workaround PCI: Mark Atheros AR9485 and QCA9882 to avoid bus reset MIPS: hpet: Increase HPET_MIN_PROG_DELTA and decrease HPET_MIN_CYCLES MIPS: Don't register r4k sched clock when CPUFREQ enabled MIPS: mm: Fix definition of R6 cache instruction SUNRPC: Don't allocate a full sockaddr_storage for tracing Input: elan_i2c - properly wake up touchpad on ASUS laptops target: Fix ordered task CHECK_CONDITION early exception handling target: Fix max_unmap_lba_count calc overflow target: Fix race between iscsi-target connection shutdown + ABORT_TASK target: Fix missing complete during ABORT_TASK + CMD_T_FABRIC_STOP target: Fix ordered task target_setup_cmd_from_cdb exception hang iscsi-target: Fix panic when adding second TCP connection to iSCSI session ubi: Fix race condition between ubi device creation and udev ubi: Fix early logging ubi: Make volume resize power cut aware of: fix memory leak related to safe_name() IB/mlx4: Fix memory leak if QP creation failed IB/mlx4: Fix error flow when sending mads under SRIOV IB/mlx4: Fix the SQ size of an RC QP IB/IWPM: Fix a potential skb leak IB/IPoIB: Don't update neigh validity for unresolved entries IB/SA: Use correct free function IB/mlx5: Return PORT_ERR in Active to Initializing tranisition IB/mlx5: Fix post send fence logic IB/mlx5: Fix entries check in mlx5_ib_resize_cq IB/mlx5: Fix returned values of query QP IB/mlx5: Fix entries checks in mlx5_ib_create_cq IB/mlx5: Fix MODIFY_QP command input structure ALSA: hda - Fix headset mic detection problem for two dell machines ALSA: hda: add AMD Bonaire AZ PCI ID with proper driver caps ALSA: hda/realtek - Can't adjust speaker's volume on a Dell AIO ALSA: hda: Fix krealloc() with __GFP_ZERO usage mm/hugetlb: avoid soft lockup in set_max_huge_pages() mtd: nand: fix bug writing 1 byte less than page size block: fix bdi vs gendisk lifetime mismatch block: add missing group association in bio-cloning functions metag: Fix __cmpxchg_u32 asm constraint for CMP ftrace/recordmcount: Work around for addition of metag magic but not relocations balloon: check the number of available pages in leak balloon drm/i915/dp: Revert "drm/i915/dp: fall back to 18 bpp when sink capability is unknown" drm/i915: Never fully mask the the EI up rps interrupt on SNB/IVB drm/edid: Add 6 bpc quirk for display AEO model 0. drm: Restore double clflush on the last partial cacheline drm/nouveau/fbcon: fix font width not divisible by 8 drm/nouveau/gr/nv3x: fix instobj write offsets in gr setup drm/nouveau: check for supported chipset before booting fbdev off the hw drm/radeon: support backlight control for UNIPHY3 drm/radeon: fix firmware info version checks drm/radeon: Poll for both connect/disconnect on analog connectors drm/radeon: add a delay after ATPX dGPU power off drm/amdgpu/gmc7: add missing mullins case drm/amdgpu: fix firmware info version checks drm/amdgpu: Disable RPM helpers while reprobing connectors on resume drm/amdgpu: support backlight control for UNIPHY3 drm/amdgpu: Poll for both connect/disconnect on analog connectors drm/amdgpu: add a delay after ATPX dGPU power off w1:omap_hdq: fix regression netlabel: add address family checks to netlbl_{sock,req}_delattr() ARM: dts: sunxi: Add a startup delay for fixed regulator enabled phys audit: fix a double fetch in audit_log_single_execve_arg() iommu/amd: Update Alias-DTE in update_device_table() iommu/amd: Init unity mappings only for dma_ops domains iommu/amd: Handle IOMMU_DOMAIN_DMA in ops->domain_free call-back iommu/vt-d: Return error code in domain_context_mapping_one() iommu/exynos: Suppress unbinding to prevent system failure drm/i915: Don't complain about lack of ACPI video bios nfsd: don't return an unhashed lock stateid after taking mutex nfsd: Fix race between FREE_STATEID and LOCK nfs: don't create zero-length requests MIPS: KVM: Propagate kseg0/mapped tlb fault errors MIPS: KVM: Fix gfn range check in kseg0 tlb faults MIPS: KVM: Add missing gfn range check MIPS: KVM: Fix mapped fault broken commpage handling random: add interrupt callback to VMBus IRQ handler random: print a warning for the first ten uninitialized random users random: initialize the non-blocking pool via add_hwgenerator_randomness() CIFS: Fix a possible invalid memory access in smb2_query_symlink() cifs: fix crash due to race in hmac(md5) handling cifs: Check for existing directory when opening file with O_CREAT fs/cifs: make share unaccessible at root level mountable jbd2: make journal y2038 safe ARC: mm: don't loose PTE_SPECIAL in pte_modify() remoteproc: Fix potential race condition in rproc_add ovl: disallow overlayfs as upperdir HID: uhid: fix timeout when probe races with IO EDAC: Correct channel count limit Bluetooth: Fix l2cap_sock_setsockopt() with optname BT_RCVMTU spi: pxa2xx: Clear all RFT bits in reset_sccr1() on Intel Quark i2c: efm32: fix a failure path in efm32_i2c_probe() s5p-mfc: Add release callback for memory region devs s5p-mfc: Set device name for reserved memory region devs hp-wmi: Fix wifi cannot be hard-unblocked dm: set DMF_SUSPENDED* _before_ clearing DMF_NOFLUSH_SUSPENDING sur40: fix occasional oopses on device close sur40: lower poll interval to fix occasional FPS drops to ~56 FPS Fix RC5 decoding with Fintek CIR chipset vb2: core: Skip planes array verification if pb is NULL videobuf2-v4l2: Verify planes array in buffer dequeueing media: dvb_ringbuffer: Add memory barriers media: usbtv: prevent access to free'd resources mfd: qcom_rpm: Parametrize also ack selector size mfd: qcom_rpm: Fix offset error for msm8660 intel_pstate: Fix MSR_CONFIG_TDP_x addressing in core_get_max_pstate() s390/cio: allow to reset channel measurement block KVM: nVMX: Fix memory corruption when using VMCS shadowing KVM: VMX: handle PML full VMEXIT that occurs during event delivery KVM: MTRR: fix kvm_mtrr_check_gfn_range_consistency page fault KVM: PPC: Book3S HV: Save/restore TM state in H_CEDE KVM: PPC: Book3S HV: Pull out TM state save/restore into separate procedures arm64: mm: avoid fdt_check_header() before the FDT is fully mapped arm64: dts: rockchip: fixes the gic400 2nd region size for rk3368 pinctrl: cherryview: prevent concurrent access to GPIO controllers Bluetooth: hci_intel: Fix null gpio desc pointer dereference gpio: intel-mid: Remove potentially harmful code gpio: pca953x: Fix NBANK calculation for PCA9536 tty/serial: atmel: fix RS485 half duplex with DMA serial: samsung: Fix ERR pointer dereference on deferred probe tty: serial: msm: Don't read off end of tx fifo arm64: Fix incorrect per-cpu usage for boot CPU arm64: debug: unmask PSTATE.D earlier arm64: kernel: Save and restore UAO and addr_limit on exception entry USB: usbfs: fix potential infoleak in devio usb: renesas_usbhs: fix NULL pointer dereference in xfer_work() USB: serial: option: add support for Telit LE910 PID 0x1206 usb: dwc3: fix for the isoc transfer EP_BUSY flag usb: quirks: Add no-lpm quirk for Elan usb: renesas_usbhs: protect the CFIFOSEL setting in usbhsg_ep_enable() usb: f_fs: off by one bug in _ffs_func_bind() usb: gadget: avoid exposing kernel stack UPSTREAM: usb: gadget: configfs: add mutex lock before unregister gadget ANDROID: dm-verity: adopt changes made to dm callbacks UPSTREAM: ecryptfs: fix handling of directory opening ANDROID: net: core: fix UID-based routing ANDROID: net: fib: remove duplicate assignment FROMLIST: proc: Fix timerslack_ns CAP_SYS_NICE check when adjusting self ANDROID: dm verity fec: pack the fec_header structure ANDROID: dm: android-verity: Verify header before fetching table ANDROID: dm: allow adb disable-verity only in userdebug ANDROID: dm: mount as linear target if eng build ANDROID: dm: use default verity public key ANDROID: dm: fix signature verification flag ANDROID: dm: use name_to_dev_t ANDROID: dm: rename dm-linear methods for dm-android-verity ANDROID: dm: Minor cleanup ANDROID: dm: Mounting root as linear device when verity disabled ANDROID: dm-android-verity: Rebase on top of 4.1 ANDROID: dm: Add android verity target ANDROID: dm: fix dm_substitute_devices() ANDROID: dm: Rebase on top of 4.1 CHROMIUM: dm: boot time specification of dm= Implement memory_state_time, used by qcom,cpubw Revert "panic: Add board ID to panic output" usb: gadget: f_accessory: remove duplicate endpoint alloc BACKPORT: brcmfmac: defer DPC processing during probe FROMLIST: proc: Add LSM hook checks to /proc/<tid>/timerslack_ns FROMLIST: proc: Relax /proc/<tid>/timerslack_ns capability requirements UPSTREAM: ppp: defer netns reference release for ppp channel cpuset: Add allow_attach hook for cpusets on android. UPSTREAM: KEYS: Fix ASN.1 indefinite length object parsing ANDROID: sdcardfs: fix itnull.cocci warnings android-recommended.cfg: enable fstack-protector-strong Linux 4.4.18 mm: memcontrol: fix memcg id ref counter on swap charge move mm: memcontrol: fix swap counter leak on swapout from offline cgroup mm: memcontrol: fix cgroup creation failure after many small jobs ext4: fix reference counting bug on block allocation error ext4: short-cut orphan cleanup on error ext4: validate s_reserved_gdt_blocks on mount ext4: don't call ext4_should_journal_data() on the journal inode ext4: fix deadlock during page writeback ext4: check for extents that wrap around crypto: scatterwalk - Fix test in scatterwalk_done crypto: gcm - Filter out async ghash if necessary fs/dcache.c: avoid soft-lockup in dput() fuse: fix wrong assignment of ->flags in fuse_send_init() fuse: fuse_flush must check mapping->flags for errors fuse: fsync() did not return IO errors sysv, ipc: fix security-layer leaking block: fix use-after-free in seq file x86/syscalls/64: Add compat_sys_keyctl for 32-bit userspace drm/i915: Pretend cursor is always on for ILK-style WM calculations (v2) x86/mm/pat: Fix BUG_ON() in mmap_mem() on QEMU/i386 x86/pat: Document the PAT initialization sequence x86/xen, pat: Remove PAT table init code from Xen x86/mtrr: Fix PAT init handling when MTRR is disabled x86/mtrr: Fix Xorg crashes in Qemu sessions x86/mm/pat: Replace cpu_has_pat with boot_cpu_has() x86/mm/pat: Add pat_disable() interface x86/mm/pat: Add support of non-default PAT MSR setting devpts: clean up interface to pty drivers random: strengthen input validation for RNDADDTOENTCNT apparmor: fix ref count leak when profile sha1 hash is read Revert "s390/kdump: Clear subchannel ID to signal non-CCW/SCSI IPL" KEYS: 64-bit MIPS needs to use compat_sys_keyctl for 32-bit userspace arm: oabi compat: add missing access checks cdc_ncm: do not call usbnet_link_change from cdc_ncm_bind i2c: i801: Allow ACPI SystemIO OpRegion to conflict with PCI BAR x86/mm/32: Enable full randomization on i386 and X86_32 HID: sony: do not bail out when the sixaxis refuses the output report PNP: Add Broadwell to Intel MCH size workaround PNP: Add Haswell-ULT to Intel MCH size workaround scsi: ignore errors from scsi_dh_add_device() ipath: Restrict use of the write() interface tcp: consider recv buf for the initial window scale qed: Fix setting/clearing bit in completion bitmap net/irda: fix NULL pointer dereference on memory allocation failure net: bgmac: Fix infinite loop in bgmac_dma_tx_add() bonding: set carrier off for devices created through netlink ipv4: reject RTNH_F_DEAD and RTNH_F_LINKDOWN from user space tcp: enable per-socket rate limiting of all 'challenge acks' tcp: make challenge acks less predictable arm64: relocatable: suppress R_AARCH64_ABS64 relocations in vmlinux arm64: vmlinux.lds: make __rela_offset and __dynsym_offset ABSOLUTE Linux 4.4.17 vfs: fix deadlock in file_remove_privs() on overlayfs intel_th: Fix a deadlock in modprobing intel_th: pci: Add Kaby Lake PCH-H support net: mvneta: set real interrupt per packet for tx_done libceph: apply new_state before new_up_client on incrementals libata: LITE-ON CX1-JB256-HP needs lower max_sectors i2c: mux: reg: wrong condition checked for of_address_to_resource return value posix_cpu_timer: Exit early when process has been reaped media: fix airspy usb probe error path ipr: Clear interrupt on croc/crocodile when running with LSI SCSI: fix new bug in scsi_dev_info_list string matching RDS: fix rds_tcp_init() error path can: fix oops caused by wrong rtnl dellink usage can: fix handling of unmodifiable configuration options fix can: c_can: Update D_CAN TX and RX functions to 32 bit - fix Altera Cyclone access can: at91_can: RX queue could get stuck at high bus load perf/x86: fix PEBS issues on Intel Atom/Core2 ovl: handle ATTR_KILL* sched/fair: Fix effective_load() to consistently use smoothed load mmc: block: fix packed command header endianness block: fix use-after-free in sys_ioprio_get() qeth: delete napi struct when removing a qeth device platform/chrome: cros_ec_dev - double fetch bug in ioctl clk: rockchip: initialize flags of clk_init_data in mmc-phase clock spi: sun4i: fix FIFO limit spi: sunxi: fix transfer timeout namespace: update event counter when umounting a deleted dentry 9p: use file_dentry() ext4: verify extent header depth ecryptfs: don't allow mmap when the lower fs doesn't support it Revert "ecryptfs: forbid opening files without mmap handler" locks: use file_inode() power_supply: power_supply_read_temp only if use_cnt > 0 cgroup: set css->id to -1 during init pinctrl: imx: Do not treat a PIN without MUX register as an error pinctrl: single: Fix missing flush of posted write for a wakeirq pvclock: Add CPU barriers to get correct version value Input: tsc200x - report proper input_dev name Input: xpad - validate USB endpoint count during probe Input: wacom_w8001 - w8001_MAX_LENGTH should be 13 Input: xpad - fix oops when attaching an unknown Xbox One gamepad Input: elantech - add more IC body types to the list Input: vmmouse - remove port reservation ALSA: timer: Fix leak in events via snd_timer_user_tinterrupt ALSA: timer: Fix leak in events via snd_timer_user_ccallback ALSA: timer: Fix leak in SNDRV_TIMER_IOCTL_PARAMS xenbus: don't bail early from xenbus_dev_request_and_reply() xenbus: don't BUG() on user mode induced condition xen/pciback: Fix conf_space read/write overlap check. ARC: unwind: ensure that .debug_frame is generated (vs. .eh_frame) arc: unwind: warn only once if DW2_UNWIND is disabled kernel/sysrq, watchdog, sched/core: Reset watchdog on all CPUs while processing sysrq-w pps: do not crash when failed to register vmlinux.lds: account for destructor sections mm, meminit: ensure node is online before checking whether pages are uninitialised mm, meminit: always return a valid node from early_pfn_to_nid mm, compaction: prevent VM_BUG_ON when terminating freeing scanner fs/nilfs2: fix potential underflow in call to crc32_le mm, compaction: abort free scanner if split fails mm, sl[au]b: add __GFP_ATOMIC to the GFP reclaim mask dmaengine: at_xdmac: double FIFO flush needed to compute residue dmaengine: at_xdmac: fix residue corruption dmaengine: at_xdmac: align descriptors on 64 bits x86/quirks: Add early quirk to reset Apple AirPort card x86/quirks: Reintroduce scanning of secondary buses x86/quirks: Apply nvidia_bugs quirk only on root bus USB: OHCI: Don't mark EDs as ED_OPER if scheduling fails Conflicts: arch/arm/kernel/topology.c arch/arm64/include/asm/arch_gicv3.h arch/arm64/kernel/topology.c block/bio.c drivers/cpufreq/Kconfig drivers/md/Makefile drivers/media/dvb-core/dvb_ringbuffer.c drivers/media/tuners/tuner-xc2028.c drivers/misc/Kconfig drivers/misc/Makefile drivers/mmc/core/host.c drivers/scsi/ufs/ufshcd.c drivers/scsi/ufs/ufshcd.h drivers/usb/dwc3/gadget.c drivers/usb/gadget/configfs.c fs/ecryptfs/file.c include/linux/mmc/core.h include/linux/mmc/host.h include/linux/mmzone.h include/linux/sched.h include/linux/sched/sysctl.h include/trace/events/power.h include/trace/events/sched.h init/Kconfig kernel/cpuset.c kernel/exit.c kernel/sched/Makefile kernel/sched/core.c kernel/sched/cputime.c kernel/sched/fair.c kernel/sched/features.h kernel/sched/rt.c kernel/sched/sched.h kernel/sched/stop_task.c kernel/sched/tune.c lib/Kconfig.debug mm/Makefile mm/vmstat.c Change-Id: I243a43231ca56a6362076fa6301827e1b0493be5 Signed-off-by: Runmin Wang <runminw@codeaurora.org>
Diffstat (limited to 'kernel/sched')
-rw-r--r--kernel/sched/Makefile3
-rw-r--r--kernel/sched/core.c312
-rw-r--r--kernel/sched/cpufreq_sched.c499
-rw-r--r--kernel/sched/cputime.c15
-rw-r--r--kernel/sched/deadline.c33
-rw-r--r--kernel/sched/energy.c134
-rw-r--r--kernel/sched/fair.c1359
-rw-r--r--kernel/sched/features.h6
-rw-r--r--kernel/sched/idle.c3
-rw-r--r--kernel/sched/rt.c49
-rw-r--r--kernel/sched/sched.h228
-rw-r--r--kernel/sched/tune.c756
-rw-r--r--kernel/sched/tune.h55
-rw-r--r--kernel/sched/walt.c1171
-rw-r--r--kernel/sched/walt.h62
15 files changed, 4526 insertions, 159 deletions
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 7c0382a3eace..308f80ce2e43 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -14,7 +14,7 @@ endif
obj-y += core.o loadavg.o clock.o cputime.o
obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
obj-y += wait.o completion.o idle.o sched_avg.o
-obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
+obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o energy.o
obj-$(CONFIG_SCHED_HMP) += hmp.o boost.o
obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
obj-$(CONFIG_SCHEDSTATS) += stats.o
@@ -22,3 +22,4 @@ obj-$(CONFIG_SCHED_DEBUG) += debug.o
obj-$(CONFIG_SCHED_TUNE) += tune.o
obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
obj-$(CONFIG_SCHED_CORE_CTL) += core_ctl.o
+obj-$(CONFIG_CPU_FREQ_GOV_SCHED) += cpufreq_sched.o
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ee708909dc17..b8436a760841 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -291,6 +291,18 @@ int sysctl_sched_rt_runtime = 950000;
/* cpus with isolated domains */
cpumask_var_t cpu_isolated_map;
+struct rq *
+lock_rq_of(struct task_struct *p, unsigned long *flags)
+{
+ return task_rq_lock(p, flags);
+}
+
+void
+unlock_rq_of(struct rq *rq, struct task_struct *p, unsigned long *flags)
+{
+ task_rq_unlock(rq, p, flags);
+}
+
/*
* this_rq_lock - lock this runqueue and disable interrupts.
*/
@@ -631,7 +643,10 @@ int get_nohz_timer_target(void)
rcu_read_lock();
for_each_domain(cpu, sd) {
for_each_cpu(i, sched_domain_span(sd)) {
- if (!idle_cpu(i) && is_housekeeping_cpu(cpu)) {
+ if (cpu == i)
+ continue;
+
+ if (!idle_cpu(i) && is_housekeeping_cpu(i)) {
cpu = i;
goto unlock;
}
@@ -2580,7 +2595,7 @@ void wake_up_new_task(struct task_struct *p)
rq = __task_rq_lock(p);
mark_task_starting(p);
- activate_task(rq, p, 0);
+ activate_task(rq, p, ENQUEUE_WAKEUP_NEW);
p->on_rq = TASK_ON_RQ_QUEUED;
trace_sched_wakeup_new(p);
check_preempt_curr(rq, p, WF_FORK);
@@ -2961,6 +2976,36 @@ unsigned long nr_iowait_cpu(int cpu)
return atomic_read(&this->nr_iowait);
}
+#ifdef CONFIG_CPU_QUIET
+u64 nr_running_integral(unsigned int cpu)
+{
+ unsigned int seqcnt;
+ u64 integral;
+ struct rq *q;
+
+ if (cpu >= nr_cpu_ids)
+ return 0;
+
+ q = cpu_rq(cpu);
+
+ /*
+ * Update average to avoid reading stalled value if there were
+ * no run-queue changes for a long time. On the other hand if
+ * the changes are happening right now, just read current value
+ * directly.
+ */
+
+ seqcnt = read_seqcount_begin(&q->ave_seqcnt);
+ integral = do_nr_running_integral(q);
+ if (read_seqcount_retry(&q->ave_seqcnt, seqcnt)) {
+ read_seqcount_begin(&q->ave_seqcnt);
+ integral = q->nr_running_integral;
+ }
+
+ return integral;
+}
+#endif
+
void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
{
struct rq *rq = this_rq();
@@ -3051,6 +3096,93 @@ unsigned long long task_sched_runtime(struct task_struct *p)
return ns;
}
+#ifdef CONFIG_CPU_FREQ_GOV_SCHED
+
+static inline
+unsigned long add_capacity_margin(unsigned long cpu_capacity)
+{
+ cpu_capacity = cpu_capacity * capacity_margin;
+ cpu_capacity /= SCHED_CAPACITY_SCALE;
+ return cpu_capacity;
+}
+
+static inline
+unsigned long sum_capacity_reqs(unsigned long cfs_cap,
+ struct sched_capacity_reqs *scr)
+{
+ unsigned long total = add_capacity_margin(cfs_cap + scr->rt);
+ return total += scr->dl;
+}
+
+static void sched_freq_tick_pelt(int cpu)
+{
+ unsigned long cpu_utilization = capacity_max;
+ unsigned long capacity_curr = capacity_curr_of(cpu);
+ struct sched_capacity_reqs *scr;
+
+ scr = &per_cpu(cpu_sched_capacity_reqs, cpu);
+ if (sum_capacity_reqs(cpu_utilization, scr) < capacity_curr)
+ return;
+
+ /*
+ * To make free room for a task that is building up its "real"
+ * utilization and to harm its performance the least, request
+ * a jump to a higher OPP as soon as the margin of free capacity
+ * is impacted (specified by capacity_margin).
+ */
+ set_cfs_cpu_capacity(cpu, true, cpu_utilization);
+}
+
+#ifdef CONFIG_SCHED_WALT
+static void sched_freq_tick_walt(int cpu)
+{
+ unsigned long cpu_utilization = cpu_util(cpu);
+ unsigned long capacity_curr = capacity_curr_of(cpu);
+
+ if (walt_disabled || !sysctl_sched_use_walt_cpu_util)
+ return sched_freq_tick_pelt(cpu);
+
+ /*
+ * Add a margin to the WALT utilization.
+ * NOTE: WALT tracks a single CPU signal for all the scheduling
+ * classes, thus this margin is going to be added to the DL class as
+ * well, which is something we do not do in sched_freq_tick_pelt case.
+ */
+ cpu_utilization = add_capacity_margin(cpu_utilization);
+ if (cpu_utilization <= capacity_curr)
+ return;
+
+ /*
+ * It is likely that the load is growing so we
+ * keep the added margin in our request as an
+ * extra boost.
+ */
+ set_cfs_cpu_capacity(cpu, true, cpu_utilization);
+
+}
+#define _sched_freq_tick(cpu) sched_freq_tick_walt(cpu)
+#else
+#define _sched_freq_tick(cpu) sched_freq_tick_pelt(cpu)
+#endif /* CONFIG_SCHED_WALT */
+
+static void sched_freq_tick(int cpu)
+{
+ unsigned long capacity_orig, capacity_curr;
+
+ if (!sched_freq())
+ return;
+
+ capacity_orig = capacity_orig_of(cpu);
+ capacity_curr = capacity_curr_of(cpu);
+ if (capacity_curr == capacity_orig)
+ return;
+
+ _sched_freq_tick(cpu);
+}
+#else
+static inline void sched_freq_tick(int cpu) { }
+#endif /* CONFIG_CPU_FREQ_GOV_SCHED */
+
/*
* This function gets called by the timer code, with HZ frequency.
* We call it with interrupts disabled.
@@ -3077,6 +3209,7 @@ void scheduler_tick(void)
wallclock = sched_ktime_clock();
update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
early_notif = early_detection_notify(rq, wallclock);
+ sched_freq_tick(cpu);
raw_spin_unlock(&rq->lock);
if (early_notif)
@@ -5193,15 +5326,19 @@ void show_state_filter(unsigned long state_filter)
/*
* reset the NMI-timeout, listing all files on a slow
* console might take a lot of time:
+ * Also, reset softlockup watchdogs on all CPUs, because
+ * another CPU might be blocked waiting for us to process
+ * an IPI.
*/
touch_nmi_watchdog();
+ touch_all_softlockup_watchdogs();
if (!state_filter || (p->state & state_filter))
sched_show_task(p);
}
touch_all_softlockup_watchdogs();
-#ifdef CONFIG_SYSRQ_SCHED_DEBUG
+#ifdef CONFIG_SCHED_DEBUG
sysrq_sched_debug_show();
#endif
rcu_read_unlock();
@@ -5231,14 +5368,14 @@ void init_idle(struct task_struct *idle, int cpu, bool cpu_up)
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
- __sched_fork(0, idle);
-
if (!cpu_up)
init_new_task_load(idle, true);
raw_spin_lock_irqsave(&idle->pi_lock, flags);
raw_spin_lock(&rq->lock);
+ __sched_fork(0, idle);
+
idle->state = TASK_RUNNING;
idle->se.exec_start = sched_clock();
@@ -5911,9 +6048,60 @@ set_table_entry(struct ctl_table *entry,
}
static struct ctl_table *
+sd_alloc_ctl_energy_table(struct sched_group_energy *sge)
+{
+ struct ctl_table *table = sd_alloc_ctl_entry(5);
+
+ if (table == NULL)
+ return NULL;
+
+ set_table_entry(&table[0], "nr_idle_states", &sge->nr_idle_states,
+ sizeof(int), 0644, proc_dointvec_minmax, false);
+ set_table_entry(&table[1], "idle_states", &sge->idle_states[0].power,
+ sge->nr_idle_states*sizeof(struct idle_state), 0644,
+ proc_doulongvec_minmax, false);
+ set_table_entry(&table[2], "nr_cap_states", &sge->nr_cap_states,
+ sizeof(int), 0644, proc_dointvec_minmax, false);
+ set_table_entry(&table[3], "cap_states", &sge->cap_states[0].cap,
+ sge->nr_cap_states*sizeof(struct capacity_state), 0644,
+ proc_doulongvec_minmax, false);
+
+ return table;
+}
+
+static struct ctl_table *
+sd_alloc_ctl_group_table(struct sched_group *sg)
+{
+ struct ctl_table *table = sd_alloc_ctl_entry(2);
+
+ if (table == NULL)
+ return NULL;
+
+ table->procname = kstrdup("energy", GFP_KERNEL);
+ table->mode = 0555;
+ table->child = sd_alloc_ctl_energy_table((struct sched_group_energy *)sg->sge);
+
+ return table;
+}
+
+static struct ctl_table *
sd_alloc_ctl_domain_table(struct sched_domain *sd)
{
- struct ctl_table *table = sd_alloc_ctl_entry(14);
+ struct ctl_table *table;
+ unsigned int nr_entries = 14;
+
+ int i = 0;
+ struct sched_group *sg = sd->groups;
+
+ if (sg->sge) {
+ int nr_sgs = 0;
+
+ do {} while (nr_sgs++, sg = sg->next, sg != sd->groups);
+
+ nr_entries += nr_sgs;
+ }
+
+ table = sd_alloc_ctl_entry(nr_entries);
if (table == NULL)
return NULL;
@@ -5946,7 +6134,19 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
sizeof(long), 0644, proc_doulongvec_minmax, false);
set_table_entry(&table[12], "name", sd->name,
CORENAME_MAX_SIZE, 0444, proc_dostring, false);
- /* &table[13] is terminator */
+ sg = sd->groups;
+ if (sg->sge) {
+ char buf[32];
+ struct ctl_table *entry = &table[13];
+
+ do {
+ snprintf(buf, 32, "group%d", i);
+ entry->procname = kstrdup(buf, GFP_KERNEL);
+ entry->mode = 0555;
+ entry->child = sd_alloc_ctl_group_table(sg);
+ } while (entry++, i++, sg = sg->next, sg != sd->groups);
+ }
+ /* &table[nr_entries-1] is terminator */
return table;
}
@@ -6259,7 +6459,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
printk(KERN_CONT " %*pbl",
cpumask_pr_args(sched_group_cpus(group)));
if (group->sgc->capacity != SCHED_CAPACITY_SCALE) {
- printk(KERN_CONT " (cpu_capacity = %d)",
+ printk(KERN_CONT " (cpu_capacity = %lu)",
group->sgc->capacity);
}
@@ -6320,7 +6520,8 @@ static int sd_degenerate(struct sched_domain *sd)
SD_BALANCE_EXEC |
SD_SHARE_CPUCAPACITY |
SD_SHARE_PKG_RESOURCES |
- SD_SHARE_POWERDOMAIN)) {
+ SD_SHARE_POWERDOMAIN |
+ SD_SHARE_CAP_STATES)) {
if (sd->groups != sd->groups->next)
return 0;
}
@@ -6352,7 +6553,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
SD_SHARE_CPUCAPACITY |
SD_SHARE_PKG_RESOURCES |
SD_PREFER_SIBLING |
- SD_SHARE_POWERDOMAIN);
+ SD_SHARE_POWERDOMAIN |
+ SD_SHARE_CAP_STATES);
if (nr_node_ids == 1)
pflags &= ~SD_SERIALIZE;
}
@@ -6431,6 +6633,8 @@ static int init_rootdomain(struct root_domain *rd)
if (cpupri_init(&rd->cpupri) != 0)
goto free_rto_mask;
+
+ init_max_cpu_capacity(&rd->max_cpu_capacity);
return 0;
free_rto_mask:
@@ -6536,11 +6740,13 @@ DEFINE_PER_CPU(int, sd_llc_id);
DEFINE_PER_CPU(struct sched_domain *, sd_numa);
DEFINE_PER_CPU(struct sched_domain *, sd_busy);
DEFINE_PER_CPU(struct sched_domain *, sd_asym);
+DEFINE_PER_CPU(struct sched_domain *, sd_ea);
+DEFINE_PER_CPU(struct sched_domain *, sd_scs);
static void update_top_cache_domain(int cpu)
{
struct sched_domain *sd;
- struct sched_domain *busy_sd = NULL;
+ struct sched_domain *busy_sd = NULL, *ea_sd = NULL;
int id = cpu;
int size = 1;
@@ -6561,6 +6767,17 @@ static void update_top_cache_domain(int cpu)
sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);
+
+ for_each_domain(cpu, sd) {
+ if (sd->groups->sge)
+ ea_sd = sd;
+ else
+ break;
+ }
+ rcu_assign_pointer(per_cpu(sd_ea, cpu), ea_sd);
+
+ sd = highest_flag_domain(cpu, SD_SHARE_CAP_STATES);
+ rcu_assign_pointer(per_cpu(sd_scs, cpu), sd);
}
/*
@@ -6733,6 +6950,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
* die on a /0 trap.
*/
sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
+ sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
/*
* Make sure the first group of this domain contains the
@@ -6865,6 +7083,66 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
}
/*
+ * Check that the per-cpu provided sd energy data is consistent for all cpus
+ * within the mask.
+ */
+static inline void check_sched_energy_data(int cpu, sched_domain_energy_f fn,
+ const struct cpumask *cpumask)
+{
+ const struct sched_group_energy * const sge = fn(cpu);
+ struct cpumask mask;
+ int i;
+
+ if (cpumask_weight(cpumask) <= 1)
+ return;
+
+ cpumask_xor(&mask, cpumask, get_cpu_mask(cpu));
+
+ for_each_cpu(i, &mask) {
+ const struct sched_group_energy * const e = fn(i);
+ int y;
+
+ BUG_ON(e->nr_idle_states != sge->nr_idle_states);
+
+ for (y = 0; y < (e->nr_idle_states); y++) {
+ BUG_ON(e->idle_states[y].power !=
+ sge->idle_states[y].power);
+ }
+
+ BUG_ON(e->nr_cap_states != sge->nr_cap_states);
+
+ for (y = 0; y < (e->nr_cap_states); y++) {
+ BUG_ON(e->cap_states[y].cap != sge->cap_states[y].cap);
+ BUG_ON(e->cap_states[y].power !=
+ sge->cap_states[y].power);
+ }
+ }
+}
+
+static void init_sched_energy(int cpu, struct sched_domain *sd,
+ sched_domain_energy_f fn)
+{
+ if (!(fn && fn(cpu)))
+ return;
+
+ if (cpu != group_balance_cpu(sd->groups))
+ return;
+
+ if (sd->child && !sd->child->groups->sge) {
+ pr_err("BUG: EAS setup broken for CPU%d\n", cpu);
+#ifdef CONFIG_SCHED_DEBUG
+ pr_err(" energy data on %s but not on %s domain\n",
+ sd->name, sd->child->name);
+#endif
+ return;
+ }
+
+ check_sched_energy_data(cpu, fn, sched_group_cpus(sd->groups));
+
+ sd->groups->sge = fn(cpu);
+}
+
+/*
* Initializers for schedule domains
* Non-inlined to reduce accumulated stack pressure in build_sched_domains()
*/
@@ -6972,6 +7250,7 @@ static int sched_domains_curr_level;
* SD_SHARE_PKG_RESOURCES - describes shared caches
* SD_NUMA - describes NUMA topologies
* SD_SHARE_POWERDOMAIN - describes shared power domain
+ * SD_SHARE_CAP_STATES - describes shared capacity states
*
* Odd one out:
* SD_ASYM_PACKING - describes SMT quirks
@@ -6981,7 +7260,8 @@ static int sched_domains_curr_level;
SD_SHARE_PKG_RESOURCES | \
SD_NUMA | \
SD_ASYM_PACKING | \
- SD_SHARE_POWERDOMAIN)
+ SD_SHARE_POWERDOMAIN | \
+ SD_SHARE_CAP_STATES)
static struct sched_domain *
sd_init(struct sched_domain_topology_level *tl, int cpu)
@@ -7534,6 +7814,7 @@ static int build_sched_domains(const struct cpumask *cpu_map,
enum s_alloc alloc_state;
struct sched_domain *sd;
struct s_data d;
+ struct rq *rq = NULL;
int i, ret = -ENOMEM;
alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
@@ -7572,10 +7853,14 @@ static int build_sched_domains(const struct cpumask *cpu_map,
/* Calculate CPU capacity for physical packages and nodes */
for (i = nr_cpumask_bits-1; i >= 0; i--) {
+ struct sched_domain_topology_level *tl = sched_domain_topology;
+
if (!cpumask_test_cpu(i, cpu_map))
continue;
- for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
+ for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent, tl++) {
+ if (energy_aware())
+ init_sched_energy(i, sd, tl->energy);
claim_allocations(i, sd);
init_sched_groups_capacity(i, sd);
}
@@ -7584,6 +7869,7 @@ static int build_sched_domains(const struct cpumask *cpu_map,
/* Attach the domains */
rcu_read_lock();
for_each_cpu(i, cpu_map) {
+ rq = cpu_rq(i);
sd = *per_cpu_ptr(d.sd, i);
cpu_attach_domain(sd, d.rd, i);
}
diff --git a/kernel/sched/cpufreq_sched.c b/kernel/sched/cpufreq_sched.c
new file mode 100644
index 000000000000..f6f9b9b3a4a8
--- /dev/null
+++ b/kernel/sched/cpufreq_sched.c
@@ -0,0 +1,499 @@
+/*
+ * Copyright (C) 2015 Michael Turquette <mturquette@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/cpufreq.h>
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include <linux/percpu.h>
+#include <linux/irq_work.h>
+#include <linux/delay.h>
+#include <linux/string.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/cpufreq_sched.h>
+
+#include "sched.h"
+
+#define THROTTLE_DOWN_NSEC 50000000 /* 50ms default */
+#define THROTTLE_UP_NSEC 500000 /* 500us default */
+
+struct static_key __read_mostly __sched_freq = STATIC_KEY_INIT_FALSE;
+static bool __read_mostly cpufreq_driver_slow;
+
+#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED
+static struct cpufreq_governor cpufreq_gov_sched;
+#endif
+
+static DEFINE_PER_CPU(unsigned long, enabled);
+DEFINE_PER_CPU(struct sched_capacity_reqs, cpu_sched_capacity_reqs);
+
+/**
+ * gov_data - per-policy data internal to the governor
+ * @up_throttle: next throttling period expiry if increasing OPP
+ * @down_throttle: next throttling period expiry if decreasing OPP
+ * @up_throttle_nsec: throttle period length in nanoseconds if increasing OPP
+ * @down_throttle_nsec: throttle period length in nanoseconds if decreasing OPP
+ * @task: worker thread for dvfs transition that may block/sleep
+ * @irq_work: callback used to wake up worker thread
+ * @requested_freq: last frequency requested by the sched governor
+ *
+ * struct gov_data is the per-policy cpufreq_sched-specific data structure. A
+ * per-policy instance of it is created when the cpufreq_sched governor receives
+ * the CPUFREQ_GOV_START condition and a pointer to it exists in the gov_data
+ * member of struct cpufreq_policy.
+ *
+ * Readers of this data must call down_read(policy->rwsem). Writers must
+ * call down_write(policy->rwsem).
+ */
+struct gov_data {
+ ktime_t up_throttle;
+ ktime_t down_throttle;
+ unsigned int up_throttle_nsec;
+ unsigned int down_throttle_nsec;
+ struct task_struct *task;
+ struct irq_work irq_work;
+ unsigned int requested_freq;
+};
+
+static void cpufreq_sched_try_driver_target(struct cpufreq_policy *policy,
+ unsigned int freq)
+{
+ struct gov_data *gd = policy->governor_data;
+
+ /* avoid race with cpufreq_sched_stop */
+ if (!down_write_trylock(&policy->rwsem))
+ return;
+
+ __cpufreq_driver_target(policy, freq, CPUFREQ_RELATION_L);
+
+ gd->up_throttle = ktime_add_ns(ktime_get(), gd->up_throttle_nsec);
+ gd->down_throttle = ktime_add_ns(ktime_get(), gd->down_throttle_nsec);
+ up_write(&policy->rwsem);
+}
+
+static bool finish_last_request(struct gov_data *gd, unsigned int cur_freq)
+{
+ ktime_t now = ktime_get();
+
+ ktime_t throttle = gd->requested_freq < cur_freq ?
+ gd->down_throttle : gd->up_throttle;
+
+ if (ktime_after(now, throttle))
+ return false;
+
+ while (1) {
+ int usec_left = ktime_to_ns(ktime_sub(throttle, now));
+
+ usec_left /= NSEC_PER_USEC;
+ trace_cpufreq_sched_throttled(usec_left);
+ usleep_range(usec_left, usec_left + 100);
+ now = ktime_get();
+ if (ktime_after(now, throttle))
+ return true;
+ }
+}
+
+/*
+ * we pass in struct cpufreq_policy. This is safe because changing out the
+ * policy requires a call to __cpufreq_governor(policy, CPUFREQ_GOV_STOP),
+ * which tears down all of the data structures and __cpufreq_governor(policy,
+ * CPUFREQ_GOV_START) will do a full rebuild, including this kthread with the
+ * new policy pointer
+ */
+static int cpufreq_sched_thread(void *data)
+{
+ struct sched_param param;
+ struct cpufreq_policy *policy;
+ struct gov_data *gd;
+ unsigned int new_request = 0;
+ unsigned int last_request = 0;
+ int ret;
+
+ policy = (struct cpufreq_policy *) data;
+ gd = policy->governor_data;
+
+ param.sched_priority = 50;
+ ret = sched_setscheduler_nocheck(gd->task, SCHED_FIFO, &param);
+ if (ret) {
+ pr_warn("%s: failed to set SCHED_FIFO\n", __func__);
+ do_exit(-EINVAL);
+ } else {
+ pr_debug("%s: kthread (%d) set to SCHED_FIFO\n",
+ __func__, gd->task->pid);
+ }
+
+ do {
+ new_request = gd->requested_freq;
+ if (new_request == last_request) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (kthread_should_stop())
+ break;
+ schedule();
+ } else {
+ /*
+ * if the frequency thread sleeps while waiting to be
+ * unthrottled, start over to check for a newer request
+ */
+ if (finish_last_request(gd, policy->cur))
+ continue;
+ last_request = new_request;
+ cpufreq_sched_try_driver_target(policy, new_request);
+ }
+ } while (!kthread_should_stop());
+
+ return 0;
+}
+
+static void cpufreq_sched_irq_work(struct irq_work *irq_work)
+{
+ struct gov_data *gd;
+
+ gd = container_of(irq_work, struct gov_data, irq_work);
+ if (!gd)
+ return;
+
+ wake_up_process(gd->task);
+}
+
+static void update_fdomain_capacity_request(int cpu)
+{
+ unsigned int freq_new, index_new, cpu_tmp;
+ struct cpufreq_policy *policy;
+ struct gov_data *gd;
+ unsigned long capacity = 0;
+
+ /*
+ * Avoid grabbing the policy if possible. A test is still
+ * required after locking the CPU's policy to avoid racing
+ * with the governor changing.
+ */
+ if (!per_cpu(enabled, cpu))
+ return;
+
+ policy = cpufreq_cpu_get(cpu);
+ if (IS_ERR_OR_NULL(policy))
+ return;
+
+ if (policy->governor != &cpufreq_gov_sched ||
+ !policy->governor_data)
+ goto out;
+
+ gd = policy->governor_data;
+
+ /* find max capacity requested by cpus in this policy */
+ for_each_cpu(cpu_tmp, policy->cpus) {
+ struct sched_capacity_reqs *scr;
+
+ scr = &per_cpu(cpu_sched_capacity_reqs, cpu_tmp);
+ capacity = max(capacity, scr->total);
+ }
+
+ /* Convert the new maximum capacity request into a cpu frequency */
+ freq_new = capacity * policy->max >> SCHED_CAPACITY_SHIFT;
+ if (cpufreq_frequency_table_target(policy, policy->freq_table,
+ freq_new, CPUFREQ_RELATION_L,
+ &index_new))
+ goto out;
+ freq_new = policy->freq_table[index_new].frequency;
+
+ if (freq_new > policy->max)
+ freq_new = policy->max;
+
+ if (freq_new < policy->min)
+ freq_new = policy->min;
+
+ trace_cpufreq_sched_request_opp(cpu, capacity, freq_new,
+ gd->requested_freq);
+ if (freq_new == gd->requested_freq)
+ goto out;
+
+ gd->requested_freq = freq_new;
+
+ /*
+ * Throttling is not yet supported on platforms with fast cpufreq
+ * drivers.
+ */
+ if (cpufreq_driver_slow)
+ irq_work_queue_on(&gd->irq_work, cpu);
+ else
+ cpufreq_sched_try_driver_target(policy, freq_new);
+
+out:
+ cpufreq_cpu_put(policy);
+}
+
+void update_cpu_capacity_request(int cpu, bool request)
+{
+ unsigned long new_capacity;
+ struct sched_capacity_reqs *scr;
+
+ /* The rq lock serializes access to the CPU's sched_capacity_reqs. */
+ lockdep_assert_held(&cpu_rq(cpu)->lock);
+
+ scr = &per_cpu(cpu_sched_capacity_reqs, cpu);
+
+ new_capacity = scr->cfs + scr->rt;
+ new_capacity = new_capacity * capacity_margin
+ / SCHED_CAPACITY_SCALE;
+ new_capacity += scr->dl;
+
+ if (new_capacity == scr->total)
+ return;
+
+ trace_cpufreq_sched_update_capacity(cpu, request, scr, new_capacity);
+
+ scr->total = new_capacity;
+ if (request)
+ update_fdomain_capacity_request(cpu);
+}
+
+static inline void set_sched_freq(void)
+{
+ static_key_slow_inc(&__sched_freq);
+}
+
+static inline void clear_sched_freq(void)
+{
+ static_key_slow_dec(&__sched_freq);
+}
+
+static struct attribute_group sched_attr_group_gov_pol;
+static struct attribute_group *get_sysfs_attr(void)
+{
+ return &sched_attr_group_gov_pol;
+}
+
+static int cpufreq_sched_policy_init(struct cpufreq_policy *policy)
+{
+ struct gov_data *gd;
+ int cpu;
+ int rc;
+
+ for_each_cpu(cpu, policy->cpus)
+ memset(&per_cpu(cpu_sched_capacity_reqs, cpu), 0,
+ sizeof(struct sched_capacity_reqs));
+
+ gd = kzalloc(sizeof(*gd), GFP_KERNEL);
+ if (!gd)
+ return -ENOMEM;
+
+ gd->up_throttle_nsec = policy->cpuinfo.transition_latency ?
+ policy->cpuinfo.transition_latency :
+ THROTTLE_UP_NSEC;
+ gd->down_throttle_nsec = THROTTLE_DOWN_NSEC;
+ pr_debug("%s: throttle threshold = %u [ns]\n",
+ __func__, gd->up_throttle_nsec);
+
+ rc = sysfs_create_group(get_governor_parent_kobj(policy), get_sysfs_attr());
+ if (rc) {
+ pr_err("%s: couldn't create sysfs attributes: %d\n", __func__, rc);
+ goto err;
+ }
+
+ policy->governor_data = gd;
+ if (cpufreq_driver_is_slow()) {
+ cpufreq_driver_slow = true;
+ gd->task = kthread_create(cpufreq_sched_thread, policy,
+ "kschedfreq:%d",
+ cpumask_first(policy->related_cpus));
+ if (IS_ERR_OR_NULL(gd->task)) {
+ pr_err("%s: failed to create kschedfreq thread\n",
+ __func__);
+ goto err;
+ }
+ get_task_struct(gd->task);
+ kthread_bind_mask(gd->task, policy->related_cpus);
+ wake_up_process(gd->task);
+ init_irq_work(&gd->irq_work, cpufreq_sched_irq_work);
+ }
+
+ set_sched_freq();
+
+ return 0;
+
+err:
+ policy->governor_data = NULL;
+ kfree(gd);
+ return -ENOMEM;
+}
+
+static int cpufreq_sched_policy_exit(struct cpufreq_policy *policy)
+{
+ struct gov_data *gd = policy->governor_data;
+
+ clear_sched_freq();
+ if (cpufreq_driver_slow) {
+ kthread_stop(gd->task);
+ put_task_struct(gd->task);
+ }
+
+ sysfs_remove_group(get_governor_parent_kobj(policy), get_sysfs_attr());
+
+ policy->governor_data = NULL;
+
+ kfree(gd);
+ return 0;
+}
+
+static int cpufreq_sched_start(struct cpufreq_policy *policy)
+{
+ int cpu;
+
+ for_each_cpu(cpu, policy->cpus)
+ per_cpu(enabled, cpu) = 1;
+
+ return 0;
+}
+
+static void cpufreq_sched_limits(struct cpufreq_policy *policy)
+{
+ unsigned int clamp_freq;
+ struct gov_data *gd = policy->governor_data;;
+
+ pr_debug("limit event for cpu %u: %u - %u kHz, currently %u kHz\n",
+ policy->cpu, policy->min, policy->max,
+ policy->cur);
+
+ clamp_freq = clamp(gd->requested_freq, policy->min, policy->max);
+
+ if (policy->cur != clamp_freq)
+ __cpufreq_driver_target(policy, clamp_freq, CPUFREQ_RELATION_L);
+}
+
+static int cpufreq_sched_stop(struct cpufreq_policy *policy)
+{
+ int cpu;
+
+ for_each_cpu(cpu, policy->cpus)
+ per_cpu(enabled, cpu) = 0;
+
+ return 0;
+}
+
+static int cpufreq_sched_setup(struct cpufreq_policy *policy,
+ unsigned int event)
+{
+ switch (event) {
+ case CPUFREQ_GOV_POLICY_INIT:
+ return cpufreq_sched_policy_init(policy);
+ case CPUFREQ_GOV_POLICY_EXIT:
+ return cpufreq_sched_policy_exit(policy);
+ case CPUFREQ_GOV_START:
+ return cpufreq_sched_start(policy);
+ case CPUFREQ_GOV_STOP:
+ return cpufreq_sched_stop(policy);
+ case CPUFREQ_GOV_LIMITS:
+ cpufreq_sched_limits(policy);
+ break;
+ }
+ return 0;
+}
+
+/* Tunables */
+static ssize_t show_up_throttle_nsec(struct gov_data *gd, char *buf)
+{
+ return sprintf(buf, "%u\n", gd->up_throttle_nsec);
+}
+
+static ssize_t store_up_throttle_nsec(struct gov_data *gd,
+ const char *buf, size_t count)
+{
+ int ret;
+ long unsigned int val;
+
+ ret = kstrtoul(buf, 0, &val);
+ if (ret < 0)
+ return ret;
+ gd->up_throttle_nsec = val;
+ return count;
+}
+
+static ssize_t show_down_throttle_nsec(struct gov_data *gd, char *buf)
+{
+ return sprintf(buf, "%u\n", gd->down_throttle_nsec);
+}
+
+static ssize_t store_down_throttle_nsec(struct gov_data *gd,
+ const char *buf, size_t count)
+{
+ int ret;
+ long unsigned int val;
+
+ ret = kstrtoul(buf, 0, &val);
+ if (ret < 0)
+ return ret;
+ gd->down_throttle_nsec = val;
+ return count;
+}
+
+/*
+ * Create show/store routines
+ * - sys: One governor instance for complete SYSTEM
+ * - pol: One governor instance per struct cpufreq_policy
+ */
+#define show_gov_pol_sys(file_name) \
+static ssize_t show_##file_name##_gov_pol \
+(struct cpufreq_policy *policy, char *buf) \
+{ \
+ return show_##file_name(policy->governor_data, buf); \
+}
+
+#define store_gov_pol_sys(file_name) \
+static ssize_t store_##file_name##_gov_pol \
+(struct cpufreq_policy *policy, const char *buf, size_t count) \
+{ \
+ return store_##file_name(policy->governor_data, buf, count); \
+}
+
+#define gov_pol_attr_rw(_name) \
+ static struct freq_attr _name##_gov_pol = \
+ __ATTR(_name, 0644, show_##_name##_gov_pol, store_##_name##_gov_pol)
+
+#define show_store_gov_pol_sys(file_name) \
+ show_gov_pol_sys(file_name); \
+ store_gov_pol_sys(file_name)
+#define tunable_handlers(file_name) \
+ show_gov_pol_sys(file_name); \
+ store_gov_pol_sys(file_name); \
+ gov_pol_attr_rw(file_name)
+
+tunable_handlers(down_throttle_nsec);
+tunable_handlers(up_throttle_nsec);
+
+/* Per policy governor instance */
+static struct attribute *sched_attributes_gov_pol[] = {
+ &up_throttle_nsec_gov_pol.attr,
+ &down_throttle_nsec_gov_pol.attr,
+ NULL,
+};
+
+static struct attribute_group sched_attr_group_gov_pol = {
+ .attrs = sched_attributes_gov_pol,
+ .name = "sched",
+};
+
+#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED
+static
+#endif
+struct cpufreq_governor cpufreq_gov_sched = {
+ .name = "sched",
+ .governor = cpufreq_sched_setup,
+ .owner = THIS_MODULE,
+};
+
+static int __init cpufreq_sched_init(void)
+{
+ int cpu;
+
+ for_each_cpu(cpu, cpu_possible_mask)
+ per_cpu(enabled, cpu) = 0;
+ return cpufreq_register_governor(&cpufreq_gov_sched);
+}
+
+/* Try to make this the default governor */
+fs_initcall(cpufreq_sched_init);
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index f29b132a9f8b..692d1f888f17 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -611,19 +611,25 @@ static void cputime_adjust(struct task_cputime *curr,
stime = curr->stime;
utime = curr->utime;
- if (utime == 0) {
- stime = rtime;
+ /*
+ * If either stime or both stime and utime are 0, assume all runtime is
+ * userspace. Once a task gets some ticks, the monotonicy code at
+ * 'update' will ensure things converge to the observed ratio.
+ */
+ if (stime == 0) {
+ utime = rtime;
goto update;
}
- if (stime == 0) {
- utime = rtime;
+ if (utime == 0) {
+ stime = rtime;
goto update;
}
stime = scale_stime((__force u64)stime, (__force u64)rtime,
(__force u64)(stime + utime));
+update:
/*
* Make sure stime doesn't go backwards; this preserves monotonicity
* for utime because rtime is monotonic.
@@ -646,7 +652,6 @@ static void cputime_adjust(struct task_cputime *curr,
stime = rtime - utime;
}
-update:
prev->stime = stime;
prev->utime = utime;
out:
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 44178fea87d0..685ae83b2bfa 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -43,6 +43,24 @@ static inline int on_dl_rq(struct sched_dl_entity *dl_se)
return !RB_EMPTY_NODE(&dl_se->rb_node);
}
+static void add_average_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
+{
+ u64 se_bw = dl_se->dl_bw;
+
+ dl_rq->avg_bw += se_bw;
+}
+
+static void clear_average_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
+{
+ u64 se_bw = dl_se->dl_bw;
+
+ dl_rq->avg_bw -= se_bw;
+ if (dl_rq->avg_bw < 0) {
+ WARN_ON(1);
+ dl_rq->avg_bw = 0;
+ }
+}
+
static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq)
{
struct sched_dl_entity *dl_se = &p->dl;
@@ -496,6 +514,9 @@ static void update_dl_entity(struct sched_dl_entity *dl_se,
struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
struct rq *rq = rq_of_dl_rq(dl_rq);
+ if (dl_se->dl_new)
+ add_average_bw(dl_se, dl_rq);
+
/*
* The arrival of a new instance needs special treatment, i.e.,
* the actual scheduling parameters have to be "renewed".
@@ -743,8 +764,6 @@ static void update_curr_dl(struct rq *rq)
curr->se.exec_start = rq_clock_task(rq);
cpuacct_charge(curr, delta_exec);
- sched_rt_avg_update(rq, delta_exec);
-
dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec;
if (dl_runtime_exceeded(dl_se)) {
dl_se->dl_throttled = 1;
@@ -1280,6 +1299,8 @@ static void task_fork_dl(struct task_struct *p)
static void task_dead_dl(struct task_struct *p)
{
struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
+ struct dl_rq *dl_rq = dl_rq_of_se(&p->dl);
+ struct rq *rq = rq_of_dl_rq(dl_rq);
/*
* Since we are TASK_DEAD we won't slip out of the domain!
@@ -1288,6 +1309,8 @@ static void task_dead_dl(struct task_struct *p)
/* XXX we should retain the bw until 0-lag */
dl_b->total_bw -= p->dl.dl_bw;
raw_spin_unlock_irq(&dl_b->lock);
+
+ clear_average_bw(&p->dl, &rq->dl);
}
static void set_curr_task_dl(struct rq *rq)
@@ -1596,7 +1619,9 @@ retry:
next_task->on_rq = TASK_ON_RQ_MIGRATING;
deactivate_task(rq, next_task, 0);
+ clear_average_bw(&next_task->dl, &rq->dl);
set_task_cpu(next_task, later_rq->cpu);
+ add_average_bw(&next_task->dl, &later_rq->dl);
activate_task(later_rq, next_task, 0);
next_task->on_rq = TASK_ON_RQ_QUEUED;
ret = 1;
@@ -1686,7 +1711,9 @@ static void pull_dl_task(struct rq *this_rq)
p->on_rq = TASK_ON_RQ_MIGRATING;
deactivate_task(src_rq, p, 0);
+ clear_average_bw(&p->dl, &src_rq->dl);
set_task_cpu(p, this_cpu);
+ add_average_bw(&p->dl, &this_rq->dl);
activate_task(this_rq, p, 0);
p->on_rq = TASK_ON_RQ_QUEUED;
dmin = p->dl.deadline;
@@ -1793,6 +1820,8 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
if (!start_dl_timer(p))
__dl_clear_params(p);
+ clear_average_bw(&p->dl, &rq->dl);
+
/*
* Since this might be the only -deadline task on the rq,
* this is the right place to try to pull some other one
diff --git a/kernel/sched/energy.c b/kernel/sched/energy.c
new file mode 100644
index 000000000000..50d183b1e156
--- /dev/null
+++ b/kernel/sched/energy.c
@@ -0,0 +1,134 @@
+/*
+ * Obtain energy cost data from DT and populate relevant scheduler data
+ * structures.
+ *
+ * Copyright (C) 2015 ARM Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+#define pr_fmt(fmt) "sched-energy: " fmt
+
+#define DEBUG
+
+#include <linux/gfp.h>
+#include <linux/of.h>
+#include <linux/printk.h>
+#include <linux/sched.h>
+#include <linux/sched_energy.h>
+#include <linux/stddef.h>
+
+#include "sched.h"
+
+struct sched_group_energy *sge_array[NR_CPUS][NR_SD_LEVELS];
+bool sched_energy_aware;
+
+static void free_resources(void)
+{
+ int cpu, sd_level;
+ struct sched_group_energy *sge;
+
+ for_each_possible_cpu(cpu) {
+ for_each_possible_sd_level(sd_level) {
+ sge = sge_array[cpu][sd_level];
+ if (sge) {
+ kfree(sge->cap_states);
+ kfree(sge->idle_states);
+ kfree(sge);
+ }
+ }
+ }
+}
+
+void init_sched_energy_costs(void)
+{
+ struct device_node *cn, *cp;
+ struct capacity_state *cap_states;
+ struct idle_state *idle_states;
+ struct sched_group_energy *sge;
+ const struct property *prop;
+ int sd_level, i, nstates, cpu;
+ const __be32 *val;
+
+ if (!energy_aware()) {
+ sched_energy_aware = false;
+ return;
+ }
+
+ sched_energy_aware = true;
+
+ for_each_possible_cpu(cpu) {
+ cn = of_get_cpu_node(cpu, NULL);
+ if (!cn) {
+ pr_warn("CPU device node missing for CPU %d\n", cpu);
+ return;
+ }
+
+ if (!of_find_property(cn, "sched-energy-costs", NULL)) {
+ pr_warn("CPU device node has no sched-energy-costs\n");
+ return;
+ }
+
+ for_each_possible_sd_level(sd_level) {
+ cp = of_parse_phandle(cn, "sched-energy-costs", sd_level);
+ if (!cp)
+ break;
+
+ prop = of_find_property(cp, "busy-cost-data", NULL);
+ if (!prop || !prop->value) {
+ pr_warn("No busy-cost data, skipping sched_energy init\n");
+ goto out;
+ }
+
+ sge = kcalloc(1, sizeof(struct sched_group_energy),
+ GFP_NOWAIT);
+
+ nstates = (prop->length / sizeof(u32)) / 2;
+ cap_states = kcalloc(nstates,
+ sizeof(struct capacity_state),
+ GFP_NOWAIT);
+
+ for (i = 0, val = prop->value; i < nstates; i++) {
+ cap_states[i].cap = be32_to_cpup(val++);
+ cap_states[i].power = be32_to_cpup(val++);
+ }
+
+ sge->nr_cap_states = nstates;
+ sge->cap_states = cap_states;
+
+ prop = of_find_property(cp, "idle-cost-data", NULL);
+ if (!prop || !prop->value) {
+ pr_warn("No idle-cost data, skipping sched_energy init\n");
+ goto out;
+ }
+
+ nstates = (prop->length / sizeof(u32));
+ idle_states = kcalloc(nstates,
+ sizeof(struct idle_state),
+ GFP_NOWAIT);
+
+ for (i = 0, val = prop->value; i < nstates; i++)
+ idle_states[i].power = be32_to_cpup(val++);
+
+ sge->nr_idle_states = nstates;
+ sge->idle_states = idle_states;
+
+ sge_array[cpu][sd_level] = sge;
+ }
+ }
+
+ pr_info("Sched-energy-costs installed from DT\n");
+ return;
+
+out:
+ free_resources();
+}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3db77aff2433..14acefa27ec1 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -30,10 +30,11 @@
#include <linux/mempolicy.h>
#include <linux/migrate.h>
#include <linux/task_work.h>
+#include <linux/module.h>
#include "sched.h"
#include <trace/events/sched.h>
-
+#include "tune.h"
/*
* Targeted preemption latency for CPU-bound tasks:
* (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
@@ -49,6 +50,11 @@
unsigned int sysctl_sched_latency = 6000000ULL;
unsigned int normalized_sysctl_sched_latency = 6000000ULL;
+unsigned int sysctl_sched_is_big_little = 0;
+unsigned int sysctl_sched_sync_hint_enable = 1;
+unsigned int sysctl_sched_initial_task_util = 0;
+unsigned int sysctl_sched_cstate_aware = 1;
+
/*
* The initial- and re-scaling of tunables is configurable
* (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
@@ -692,13 +698,13 @@ void init_entity_runnable_average(struct sched_entity *se)
sa->period_contrib = 1023;
sa->load_avg = scale_load_down(se->load.weight);
sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
- sa->util_avg = scale_load_down(SCHED_LOAD_SCALE);
+ sa->util_avg = sched_freq() ?
+ sysctl_sched_initial_task_util :
+ scale_load_down(SCHED_LOAD_SCALE);
sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
/* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
}
-static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq);
-static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq);
#else
void init_entity_runnable_average(struct sched_entity *se)
{
@@ -1230,8 +1236,6 @@ static void task_numa_assign(struct task_numa_env *env,
{
if (env->best_task)
put_task_struct(env->best_task);
- if (p)
- get_task_struct(p);
env->best_task = p;
env->best_imp = imp;
@@ -1299,20 +1303,30 @@ static void task_numa_compare(struct task_numa_env *env,
long imp = env->p->numa_group ? groupimp : taskimp;
long moveimp = imp;
int dist = env->dist;
+ bool assigned = false;
rcu_read_lock();
raw_spin_lock_irq(&dst_rq->lock);
cur = dst_rq->curr;
/*
- * No need to move the exiting task, and this ensures that ->curr
- * wasn't reaped and thus get_task_struct() in task_numa_assign()
- * is safe under RCU read lock.
- * Note that rcu_read_lock() itself can't protect from the final
- * put_task_struct() after the last schedule().
+ * No need to move the exiting task or idle task.
*/
if ((cur->flags & PF_EXITING) || is_idle_task(cur))
cur = NULL;
+ else {
+ /*
+ * The task_struct must be protected here to protect the
+ * p->numa_faults access in the task_weight since the
+ * numa_faults could already be freed in the following path:
+ * finish_task_switch()
+ * --> put_task_struct()
+ * --> __put_task_struct()
+ * --> task_numa_free()
+ */
+ get_task_struct(cur);
+ }
+
raw_spin_unlock_irq(&dst_rq->lock);
/*
@@ -1396,6 +1410,7 @@ balance:
*/
if (!load_too_imbalanced(src_load, dst_load, env)) {
imp = moveimp - 1;
+ put_task_struct(cur);
cur = NULL;
goto assign;
}
@@ -1421,9 +1436,16 @@ balance:
env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
assign:
+ assigned = true;
task_numa_assign(env, cur, imp);
unlock:
rcu_read_unlock();
+ /*
+ * The dst_rq->curr isn't assigned. The protection for task_struct is
+ * finished.
+ */
+ if (cur && !assigned)
+ put_task_struct(cur);
}
static void task_numa_find_cpu(struct task_numa_env *env,
@@ -3671,6 +3693,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
scale_freq = arch_scale_freq_capacity(NULL, cpu);
scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
+ trace_sched_contrib_scale_f(cpu, scale_freq, scale_cpu);
/* delta_w is the amount already accumulated against our next period */
delta_w = sa->period_contrib;
@@ -3832,6 +3855,10 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg)
if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg)
update_tg_load_avg(cfs_rq, 0);
+
+ if (entity_is_task(se))
+ trace_sched_load_avg_task(task_of(se), &se->avg);
+ trace_sched_load_avg_cpu(cpu, cfs_rq);
}
static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -3912,27 +3939,45 @@ dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
}
-/*
- * Task first catches up with cfs_rq, and then subtract
- * itself from the cfs_rq (task must be off the queue now).
- */
-void remove_entity_load_avg(struct sched_entity *se)
-{
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
- u64 last_update_time;
-
#ifndef CONFIG_64BIT
+static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
+{
u64 last_update_time_copy;
+ u64 last_update_time;
do {
last_update_time_copy = cfs_rq->load_last_update_time_copy;
smp_rmb();
last_update_time = cfs_rq->avg.last_update_time;
} while (last_update_time != last_update_time_copy);
+
+ return last_update_time;
+}
#else
- last_update_time = cfs_rq->avg.last_update_time;
+static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
+{
+ return cfs_rq->avg.last_update_time;
+}
#endif
+/*
+ * Task first catches up with cfs_rq, and then subtract
+ * itself from the cfs_rq (task must be off the queue now).
+ */
+void remove_entity_load_avg(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ u64 last_update_time;
+
+ /*
+ * Newly created task or never used group entity should not be removed
+ * from its (source) cfs_rq
+ */
+ if (se->avg.last_update_time == 0)
+ return;
+
+ last_update_time = cfs_rq_last_update_time(cfs_rq);
+
__update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
@@ -5289,6 +5334,30 @@ static inline void hrtick_update(struct rq *rq)
}
#endif
+#ifdef CONFIG_SMP
+static bool cpu_overutilized(int cpu);
+static inline unsigned long boosted_cpu_util(int cpu);
+#else
+#define boosted_cpu_util(cpu) cpu_util(cpu)
+#endif
+
+#if defined(CONFIG_SMP) && defined(CONFIG_CPU_FREQ_GOV_SCHED)
+static void update_capacity_of(int cpu)
+{
+ unsigned long req_cap;
+
+ if (!sched_freq())
+ return;
+
+ /* Convert scale-invariant capacity to cpu. */
+ req_cap = boosted_cpu_util(cpu);
+ req_cap = req_cap * SCHED_CAPACITY_SCALE / capacity_orig_of(cpu);
+ set_cfs_cpu_capacity(cpu, true, req_cap);
+}
+#else
+#define update_capacity_of(X) do {} while(0)
+#endif /* SMP and CPU_FREQ_GOV_SCHED */
+
/*
* The enqueue_task method is called before nr_running is
* increased. Here we update the fair scheduling stats and
@@ -5299,6 +5368,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
+#ifdef CONFIG_SMP
+ int task_new = flags & ENQUEUE_WAKEUP_NEW;
+ int task_wakeup = flags & ENQUEUE_WAKEUP;
+#endif
for_each_sched_entity(se) {
if (se->on_rq)
@@ -5336,6 +5409,50 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
add_nr_running(rq, 1);
inc_rq_hmp_stats(rq, p, 1);
}
+
+#ifdef CONFIG_SMP
+
+ /*
+ * Update SchedTune accounting.
+ *
+ * We do it before updating the CPU capacity to ensure the
+ * boost value of the current task is accounted for in the
+ * selection of the OPP.
+ *
+ * We do it also in the case where we enqueue a throttled task;
+ * we could argue that a throttled task should not boost a CPU,
+ * however:
+ * a) properly implementing CPU boosting considering throttled
+ * tasks will increase a lot the complexity of the solution
+ * b) it's not easy to quantify the benefits introduced by
+ * such a more complex solution.
+ * Thus, for the time being we go for the simple solution and boost
+ * also for throttled RQs.
+ */
+ schedtune_enqueue_task(p, cpu_of(rq));
+
+ if (energy_aware() && !se) {
+ if (!task_new && !rq->rd->overutilized &&
+ cpu_overutilized(rq->cpu)) {
+ rq->rd->overutilized = true;
+ trace_sched_overutilized(true);
+ }
+
+ }
+
+ if (!se) {
+ /*
+ * We want to potentially trigger a freq switch
+ * request only for tasks that are waking up; this is
+ * because we get here also during load balancing, but
+ * in these cases it seems wise to trigger as single
+ * request after load balancing is done.
+ */
+ if (task_new || task_wakeup)
+ update_capacity_of(cpu_of(rq));
+ }
+
+#endif /* CONFIG_SMP */
hrtick_update(rq);
}
@@ -5399,6 +5516,37 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
sub_nr_running(rq, 1);
dec_rq_hmp_stats(rq, p, 1);
}
+
+#ifdef CONFIG_SMP
+
+ /*
+ * Update SchedTune accounting
+ *
+ * We do it before updating the CPU capacity to ensure the
+ * boost value of the current task is accounted for in the
+ * selection of the OPP.
+ */
+ schedtune_dequeue_task(p, cpu_of(rq));
+
+ if (!se) {
+ /*
+ * We want to potentially trigger a freq switch
+ * request only for tasks that are going to sleep;
+ * this is because we get here also during load
+ * balancing, but in these cases it seems wise to
+ * trigger as single request after load balancing is
+ * done.
+ */
+ if (task_sleep) {
+ if (rq->cfs.nr_running)
+ update_capacity_of(cpu_of(rq));
+ else if (sched_freq())
+ set_cfs_cpu_capacity(cpu_of(rq), false, 0);
+ }
+ }
+
+#endif /* CONFIG_SMP */
+
hrtick_update(rq);
}
@@ -5625,15 +5773,6 @@ static unsigned long target_load(int cpu, int type)
return max(rq->cpu_load[type-1], total);
}
-static unsigned long capacity_of(int cpu)
-{
- return cpu_rq(cpu)->cpu_capacity;
-}
-
-static unsigned long capacity_orig_of(int cpu)
-{
- return cpu_rq(cpu)->cpu_capacity_orig;
-}
static unsigned long cpu_avg_load_per_task(int cpu)
{
@@ -5746,19 +5885,24 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
return wl;
for_each_sched_entity(se) {
- long w, W;
+ struct cfs_rq *cfs_rq = se->my_q;
+ long W, w = cfs_rq_load_avg(cfs_rq);
- tg = se->my_q->tg;
+ tg = cfs_rq->tg;
/*
* W = @wg + \Sum rw_j
*/
- W = wg + calc_tg_weight(tg, se->my_q);
+ W = wg + atomic_long_read(&tg->load_avg);
+
+ /* Ensure \Sum rw_j >= rw_i */
+ W -= cfs_rq->tg_load_avg_contrib;
+ W += w;
/*
* w = rw_i + @wl
*/
- w = cfs_rq_load_avg(se->my_q) + wl;
+ w += wl;
/*
* wl = S * s'_i; see (2)
@@ -5803,6 +5947,387 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
#endif
/*
+ * Returns the current capacity of cpu after applying both
+ * cpu and freq scaling.
+ */
+unsigned long capacity_curr_of(int cpu)
+{
+ return cpu_rq(cpu)->cpu_capacity_orig *
+ arch_scale_freq_capacity(NULL, cpu)
+ >> SCHED_CAPACITY_SHIFT;
+}
+
+struct energy_env {
+ struct sched_group *sg_top;
+ struct sched_group *sg_cap;
+ int cap_idx;
+ int util_delta;
+ int src_cpu;
+ int dst_cpu;
+ int energy;
+ int payoff;
+ struct task_struct *task;
+ struct {
+ int before;
+ int after;
+ int delta;
+ int diff;
+ } nrg;
+ struct {
+ int before;
+ int after;
+ int delta;
+ } cap;
+};
+
+/*
+ * __cpu_norm_util() returns the cpu util relative to a specific capacity,
+ * i.e. it's busy ratio, in the range [0..SCHED_LOAD_SCALE] which is useful for
+ * energy calculations. Using the scale-invariant util returned by
+ * cpu_util() and approximating scale-invariant util by:
+ *
+ * util ~ (curr_freq/max_freq)*1024 * capacity_orig/1024 * running_time/time
+ *
+ * the normalized util can be found using the specific capacity.
+ *
+ * capacity = capacity_orig * curr_freq/max_freq
+ *
+ * norm_util = running_time/time ~ util/capacity
+ */
+static unsigned long __cpu_norm_util(int cpu, unsigned long capacity, int delta)
+{
+ int util = __cpu_util(cpu, delta);
+
+ if (util >= capacity)
+ return SCHED_CAPACITY_SCALE;
+
+ return (util << SCHED_CAPACITY_SHIFT)/capacity;
+}
+
+static int calc_util_delta(struct energy_env *eenv, int cpu)
+{
+ if (cpu == eenv->src_cpu)
+ return -eenv->util_delta;
+ if (cpu == eenv->dst_cpu)
+ return eenv->util_delta;
+ return 0;
+}
+
+static
+unsigned long group_max_util(struct energy_env *eenv)
+{
+ int i, delta;
+ unsigned long max_util = 0;
+
+ for_each_cpu(i, sched_group_cpus(eenv->sg_cap)) {
+ delta = calc_util_delta(eenv, i);
+ max_util = max(max_util, __cpu_util(i, delta));
+ }
+
+ return max_util;
+}
+
+/*
+ * group_norm_util() returns the approximated group util relative to it's
+ * current capacity (busy ratio) in the range [0..SCHED_LOAD_SCALE] for use in
+ * energy calculations. Since task executions may or may not overlap in time in
+ * the group the true normalized util is between max(cpu_norm_util(i)) and
+ * sum(cpu_norm_util(i)) when iterating over all cpus in the group, i. The
+ * latter is used as the estimate as it leads to a more pessimistic energy
+ * estimate (more busy).
+ */
+static unsigned
+long group_norm_util(struct energy_env *eenv, struct sched_group *sg)
+{
+ int i, delta;
+ unsigned long util_sum = 0;
+ unsigned long capacity = sg->sge->cap_states[eenv->cap_idx].cap;
+
+ for_each_cpu(i, sched_group_cpus(sg)) {
+ delta = calc_util_delta(eenv, i);
+ util_sum += __cpu_norm_util(i, capacity, delta);
+ }
+
+ if (util_sum > SCHED_CAPACITY_SCALE)
+ return SCHED_CAPACITY_SCALE;
+ return util_sum;
+}
+
+static int find_new_capacity(struct energy_env *eenv,
+ const struct sched_group_energy const *sge)
+{
+ int idx;
+ unsigned long util = group_max_util(eenv);
+
+ for (idx = 0; idx < sge->nr_cap_states; idx++) {
+ if (sge->cap_states[idx].cap >= util)
+ break;
+ }
+
+ eenv->cap_idx = idx;
+
+ return idx;
+}
+
+static int group_idle_state(struct sched_group *sg)
+{
+ int i, state = INT_MAX;
+
+ /* Find the shallowest idle state in the sched group. */
+ for_each_cpu(i, sched_group_cpus(sg))
+ state = min(state, idle_get_state_idx(cpu_rq(i)));
+
+ /* Take non-cpuidle idling into account (active idle/arch_cpu_idle()) */
+ state++;
+
+ return state;
+}
+
+/*
+ * sched_group_energy(): Computes the absolute energy consumption of cpus
+ * belonging to the sched_group including shared resources shared only by
+ * members of the group. Iterates over all cpus in the hierarchy below the
+ * sched_group starting from the bottom working it's way up before going to
+ * the next cpu until all cpus are covered at all levels. The current
+ * implementation is likely to gather the same util statistics multiple times.
+ * This can probably be done in a faster but more complex way.
+ * Note: sched_group_energy() may fail when racing with sched_domain updates.
+ */
+static int sched_group_energy(struct energy_env *eenv)
+{
+ struct sched_domain *sd;
+ int cpu, total_energy = 0;
+ struct cpumask visit_cpus;
+ struct sched_group *sg;
+
+ WARN_ON(!eenv->sg_top->sge);
+
+ cpumask_copy(&visit_cpus, sched_group_cpus(eenv->sg_top));
+
+ while (!cpumask_empty(&visit_cpus)) {
+ struct sched_group *sg_shared_cap = NULL;
+
+ cpu = cpumask_first(&visit_cpus);
+
+ /*
+ * Is the group utilization affected by cpus outside this
+ * sched_group?
+ */
+ sd = rcu_dereference(per_cpu(sd_scs, cpu));
+
+ if (!sd)
+ /*
+ * We most probably raced with hotplug; returning a
+ * wrong energy estimation is better than entering an
+ * infinite loop.
+ */
+ return -EINVAL;
+
+ if (sd->parent)
+ sg_shared_cap = sd->parent->groups;
+
+ for_each_domain(cpu, sd) {
+ sg = sd->groups;
+
+ /* Has this sched_domain already been visited? */
+ if (sd->child && group_first_cpu(sg) != cpu)
+ break;
+
+ do {
+ unsigned long group_util;
+ int sg_busy_energy, sg_idle_energy;
+ int cap_idx, idle_idx;
+
+ if (sg_shared_cap && sg_shared_cap->group_weight >= sg->group_weight)
+ eenv->sg_cap = sg_shared_cap;
+ else
+ eenv->sg_cap = sg;
+
+ cap_idx = find_new_capacity(eenv, sg->sge);
+
+ if (sg->group_weight == 1) {
+ /* Remove capacity of src CPU (before task move) */
+ if (eenv->util_delta == 0 &&
+ cpumask_test_cpu(eenv->src_cpu, sched_group_cpus(sg))) {
+ eenv->cap.before = sg->sge->cap_states[cap_idx].cap;
+ eenv->cap.delta -= eenv->cap.before;
+ }
+ /* Add capacity of dst CPU (after task move) */
+ if (eenv->util_delta != 0 &&
+ cpumask_test_cpu(eenv->dst_cpu, sched_group_cpus(sg))) {
+ eenv->cap.after = sg->sge->cap_states[cap_idx].cap;
+ eenv->cap.delta += eenv->cap.after;
+ }
+ }
+
+ idle_idx = group_idle_state(sg);
+ group_util = group_norm_util(eenv, sg);
+ sg_busy_energy = (group_util * sg->sge->cap_states[cap_idx].power)
+ >> SCHED_CAPACITY_SHIFT;
+ sg_idle_energy = ((SCHED_LOAD_SCALE-group_util)
+ * sg->sge->idle_states[idle_idx].power)
+ >> SCHED_CAPACITY_SHIFT;
+
+ total_energy += sg_busy_energy + sg_idle_energy;
+
+ if (!sd->child)
+ cpumask_xor(&visit_cpus, &visit_cpus, sched_group_cpus(sg));
+
+ if (cpumask_equal(sched_group_cpus(sg), sched_group_cpus(eenv->sg_top)))
+ goto next_cpu;
+
+ } while (sg = sg->next, sg != sd->groups);
+ }
+next_cpu:
+ cpumask_clear_cpu(cpu, &visit_cpus);
+ continue;
+ }
+
+ eenv->energy = total_energy;
+ return 0;
+}
+
+static inline bool cpu_in_sg(struct sched_group *sg, int cpu)
+{
+ return cpu != -1 && cpumask_test_cpu(cpu, sched_group_cpus(sg));
+}
+
+/*
+ * energy_diff(): Estimate the energy impact of changing the utilization
+ * distribution. eenv specifies the change: utilisation amount, source, and
+ * destination cpu. Source or destination cpu may be -1 in which case the
+ * utilization is removed from or added to the system (e.g. task wake-up). If
+ * both are specified, the utilization is migrated.
+ */
+static inline int __energy_diff(struct energy_env *eenv)
+{
+ struct sched_domain *sd;
+ struct sched_group *sg;
+ int sd_cpu = -1, energy_before = 0, energy_after = 0;
+
+ struct energy_env eenv_before = {
+ .util_delta = 0,
+ .src_cpu = eenv->src_cpu,
+ .dst_cpu = eenv->dst_cpu,
+ .nrg = { 0, 0, 0, 0},
+ .cap = { 0, 0, 0 },
+ };
+
+ if (eenv->src_cpu == eenv->dst_cpu)
+ return 0;
+
+ sd_cpu = (eenv->src_cpu != -1) ? eenv->src_cpu : eenv->dst_cpu;
+ sd = rcu_dereference(per_cpu(sd_ea, sd_cpu));
+
+ if (!sd)
+ return 0; /* Error */
+
+ sg = sd->groups;
+
+ do {
+ if (cpu_in_sg(sg, eenv->src_cpu) || cpu_in_sg(sg, eenv->dst_cpu)) {
+ eenv_before.sg_top = eenv->sg_top = sg;
+
+ if (sched_group_energy(&eenv_before))
+ return 0; /* Invalid result abort */
+ energy_before += eenv_before.energy;
+
+ /* Keep track of SRC cpu (before) capacity */
+ eenv->cap.before = eenv_before.cap.before;
+ eenv->cap.delta = eenv_before.cap.delta;
+
+ if (sched_group_energy(eenv))
+ return 0; /* Invalid result abort */
+ energy_after += eenv->energy;
+ }
+ } while (sg = sg->next, sg != sd->groups);
+
+ eenv->nrg.before = energy_before;
+ eenv->nrg.after = energy_after;
+ eenv->nrg.diff = eenv->nrg.after - eenv->nrg.before;
+ eenv->payoff = 0;
+
+ trace_sched_energy_diff(eenv->task,
+ eenv->src_cpu, eenv->dst_cpu, eenv->util_delta,
+ eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff,
+ eenv->cap.before, eenv->cap.after, eenv->cap.delta,
+ eenv->nrg.delta, eenv->payoff);
+
+ return eenv->nrg.diff;
+}
+
+#ifdef CONFIG_SCHED_TUNE
+
+struct target_nrg schedtune_target_nrg;
+
+/*
+ * System energy normalization
+ * Returns the normalized value, in the range [0..SCHED_LOAD_SCALE],
+ * corresponding to the specified energy variation.
+ */
+static inline int
+normalize_energy(int energy_diff)
+{
+ u32 normalized_nrg;
+#ifdef CONFIG_SCHED_DEBUG
+ int max_delta;
+
+ /* Check for boundaries */
+ max_delta = schedtune_target_nrg.max_power;
+ max_delta -= schedtune_target_nrg.min_power;
+ WARN_ON(abs(energy_diff) >= max_delta);
+#endif
+
+ /* Do scaling using positive numbers to increase the range */
+ normalized_nrg = (energy_diff < 0) ? -energy_diff : energy_diff;
+
+ /* Scale by energy magnitude */
+ normalized_nrg <<= SCHED_LOAD_SHIFT;
+
+ /* Normalize on max energy for target platform */
+ normalized_nrg = reciprocal_divide(
+ normalized_nrg, schedtune_target_nrg.rdiv);
+
+ return (energy_diff < 0) ? -normalized_nrg : normalized_nrg;
+}
+
+static inline int
+energy_diff(struct energy_env *eenv)
+{
+ int boost = schedtune_task_boost(eenv->task);
+ int nrg_delta;
+
+ /* Conpute "absolute" energy diff */
+ __energy_diff(eenv);
+
+ /* Return energy diff when boost margin is 0 */
+ if (boost == 0)
+ return eenv->nrg.diff;
+
+ /* Compute normalized energy diff */
+ nrg_delta = normalize_energy(eenv->nrg.diff);
+ eenv->nrg.delta = nrg_delta;
+
+ eenv->payoff = schedtune_accept_deltas(
+ eenv->nrg.delta,
+ eenv->cap.delta,
+ eenv->task);
+
+ /*
+ * When SchedTune is enabled, the energy_diff() function will return
+ * the computed energy payoff value. Since the energy_diff() return
+ * value is expected to be negative by its callers, this evaluation
+ * function return a negative value each time the evaluation return a
+ * positive payoff, which is the condition for the acceptance of
+ * a scheduling decision
+ */
+ return -eenv->payoff;
+}
+#else /* CONFIG_SCHED_TUNE */
+#define energy_diff(eenv) __energy_diff(eenv)
+#endif
+
+/*
* Detect M:N waker/wakee relationships via a switching-frequency heuristic.
* A waker of many should wake a different task than the one last awakened
* at a frequency roughly N times higher than one of its wakees. In order
@@ -5893,6 +6418,154 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
return 1;
}
+static inline unsigned long task_util(struct task_struct *p)
+{
+ return p->se.avg.util_avg;
+}
+
+unsigned int capacity_margin = 1280; /* ~20% margin */
+
+static inline unsigned long boosted_task_util(struct task_struct *task);
+
+static inline bool __task_fits(struct task_struct *p, int cpu, int util)
+{
+ unsigned long capacity = capacity_of(cpu);
+
+ util += boosted_task_util(p);
+
+ return (capacity * 1024) > (util * capacity_margin);
+}
+
+static inline bool task_fits_max(struct task_struct *p, int cpu)
+{
+ unsigned long capacity = capacity_of(cpu);
+ unsigned long max_capacity = cpu_rq(cpu)->rd->max_cpu_capacity.val;
+
+ if (capacity == max_capacity)
+ return true;
+
+ if (capacity * capacity_margin > max_capacity * 1024)
+ return true;
+
+ return __task_fits(p, cpu, 0);
+}
+
+static inline bool task_fits_spare(struct task_struct *p, int cpu)
+{
+ return __task_fits(p, cpu, cpu_util(cpu));
+}
+
+static bool cpu_overutilized(int cpu)
+{
+ return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin);
+}
+
+#ifdef CONFIG_SCHED_TUNE
+
+static long
+schedtune_margin(unsigned long signal, long boost)
+{
+ long long margin = 0;
+
+ /*
+ * Signal proportional compensation (SPC)
+ *
+ * The Boost (B) value is used to compute a Margin (M) which is
+ * proportional to the complement of the original Signal (S):
+ * M = B * (SCHED_LOAD_SCALE - S), if B is positive
+ * M = B * S, if B is negative
+ * The obtained M could be used by the caller to "boost" S.
+ */
+ if (boost >= 0) {
+ margin = SCHED_LOAD_SCALE - signal;
+ margin *= boost;
+ } else
+ margin = -signal * boost;
+ /*
+ * Fast integer division by constant:
+ * Constant : (C) = 100
+ * Precision : 0.1% (P) = 0.1
+ * Reference : C * 100 / P (R) = 100000
+ *
+ * Thus:
+ * Shift bits : ceil(log(R,2)) (S) = 17
+ * Mult const : round(2^S/C) (M) = 1311
+ *
+ *
+ */
+ margin *= 1311;
+ margin >>= 17;
+
+ if (boost < 0)
+ margin *= -1;
+ return margin;
+}
+
+static inline int
+schedtune_cpu_margin(unsigned long util, int cpu)
+{
+ int boost = schedtune_cpu_boost(cpu);
+
+ if (boost == 0)
+ return 0;
+
+ return schedtune_margin(util, boost);
+}
+
+static inline long
+schedtune_task_margin(struct task_struct *task)
+{
+ int boost = schedtune_task_boost(task);
+ unsigned long util;
+ long margin;
+
+ if (boost == 0)
+ return 0;
+
+ util = task_util(task);
+ margin = schedtune_margin(util, boost);
+
+ return margin;
+}
+
+#else /* CONFIG_SCHED_TUNE */
+
+static inline int
+schedtune_cpu_margin(unsigned long util, int cpu)
+{
+ return 0;
+}
+
+static inline int
+schedtune_task_margin(struct task_struct *task)
+{
+ return 0;
+}
+
+#endif /* CONFIG_SCHED_TUNE */
+
+static inline unsigned long
+boosted_cpu_util(int cpu)
+{
+ unsigned long util = cpu_util(cpu);
+ long margin = schedtune_cpu_margin(util, cpu);
+
+ trace_sched_boost_cpu(cpu, util, margin);
+
+ return util + margin;
+}
+
+static inline unsigned long
+boosted_task_util(struct task_struct *task)
+{
+ unsigned long util = task_util(task);
+ long margin = schedtune_task_margin(task);
+
+ trace_sched_boost_task(task, util, margin);
+
+ return util + margin;
+}
+
/*
* find_idlest_group finds and returns the least busy CPU group within the
* domain.
@@ -5902,7 +6575,10 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
int this_cpu, int sd_flag)
{
struct sched_group *idlest = NULL, *group = sd->groups;
+ struct sched_group *fit_group = NULL, *spare_group = NULL;
unsigned long min_load = ULONG_MAX, this_load = 0;
+ unsigned long fit_capacity = ULONG_MAX;
+ unsigned long max_spare_capacity = capacity_margin - SCHED_LOAD_SCALE;
int load_idx = sd->forkexec_idx;
int imbalance = 100 + (sd->imbalance_pct-100)/2;
@@ -5910,7 +6586,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
load_idx = sd->wake_idx;
do {
- unsigned long load, avg_load;
+ unsigned long load, avg_load, spare_capacity;
int local_group;
int i;
@@ -5933,6 +6609,25 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
load = target_load(i, load_idx);
avg_load += load;
+
+ /*
+ * Look for most energy-efficient group that can fit
+ * that can fit the task.
+ */
+ if (capacity_of(i) < fit_capacity && task_fits_spare(p, i)) {
+ fit_capacity = capacity_of(i);
+ fit_group = group;
+ }
+
+ /*
+ * Look for group which has most spare capacity on a
+ * single cpu.
+ */
+ spare_capacity = capacity_of(i) - cpu_util(i);
+ if (spare_capacity > max_spare_capacity) {
+ max_spare_capacity = spare_capacity;
+ spare_group = group;
+ }
}
/* Adjust by relative CPU capacity of the group */
@@ -5946,6 +6641,12 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
}
} while (group = group->next, group != sd->groups);
+ if (fit_group)
+ return fit_group;
+
+ if (spare_group)
+ return spare_group;
+
if (!idlest || 100*this_load < imbalance*min_load)
return NULL;
return idlest;
@@ -5966,7 +6667,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
/* Traverse only the allowed CPUs */
for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
- if (idle_cpu(i)) {
+ if (task_fits_spare(p, i)) {
struct rq *rq = cpu_rq(i);
struct cpuidle_state *idle = idle_get_state(rq);
if (idle && idle->exit_latency < min_exit_latency) {
@@ -5978,7 +6679,8 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
min_exit_latency = idle->exit_latency;
latest_idle_timestamp = rq->idle_stamp;
shallowest_idle_cpu = i;
- } else if ((!idle || idle->exit_latency == min_exit_latency) &&
+ } else if (idle_cpu(i) &&
+ (!idle || idle->exit_latency == min_exit_latency) &&
rq->idle_stamp > latest_idle_timestamp) {
/*
* If equal or no active idle state, then
@@ -5987,6 +6689,13 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
*/
latest_idle_timestamp = rq->idle_stamp;
shallowest_idle_cpu = i;
+ } else if (shallowest_idle_cpu == -1) {
+ /*
+ * If we haven't found an idle CPU yet
+ * pick a non-idle one that can fit the task as
+ * fallback.
+ */
+ shallowest_idle_cpu = i;
}
} else if (shallowest_idle_cpu == -1) {
load = weighted_cpuload(i);
@@ -6008,15 +6717,20 @@ static int select_idle_sibling(struct task_struct *p, int target)
struct sched_domain *sd;
struct sched_group *sg;
int i = task_cpu(p);
+ int best_idle = -1;
+ int best_idle_cstate = -1;
+ int best_idle_capacity = INT_MAX;
- if (idle_cpu(target))
- return target;
+ if (!sysctl_sched_cstate_aware) {
+ if (idle_cpu(target))
+ return target;
- /*
- * If the prevous cpu is cache affine and idle, don't be stupid.
- */
- if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
- return i;
+ /*
+ * If the prevous cpu is cache affine and idle, don't be stupid.
+ */
+ if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
+ return i;
+ }
if (!sysctl_sched_wake_to_idle &&
!(current->flags & PF_WAKE_UP_IDLE) &&
@@ -6034,54 +6748,256 @@ static int select_idle_sibling(struct task_struct *p, int target)
tsk_cpus_allowed(p)))
goto next;
- for_each_cpu(i, sched_group_cpus(sg)) {
- if (i == target || !idle_cpu(i))
- goto next;
- }
+ if (sysctl_sched_cstate_aware) {
+ for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) {
+ struct rq *rq = cpu_rq(i);
+ int idle_idx = idle_get_state_idx(rq);
+ unsigned long new_usage = boosted_task_util(p);
+ unsigned long capacity_orig = capacity_orig_of(i);
+ if (new_usage > capacity_orig || !idle_cpu(i))
+ goto next;
+
+ if (i == target && new_usage <= capacity_curr_of(target))
+ return target;
+
+ if (best_idle < 0 || (idle_idx < best_idle_cstate && capacity_orig <= best_idle_capacity)) {
+ best_idle = i;
+ best_idle_cstate = idle_idx;
+ best_idle_capacity = capacity_orig;
+ }
+ }
+ } else {
+ for_each_cpu(i, sched_group_cpus(sg)) {
+ if (i == target || !idle_cpu(i))
+ goto next;
+ }
- target = cpumask_first_and(sched_group_cpus(sg),
+ target = cpumask_first_and(sched_group_cpus(sg),
tsk_cpus_allowed(p));
- goto done;
+ goto done;
+ }
next:
sg = sg->next;
} while (sg != sd->groups);
}
+ if (best_idle > 0)
+ target = best_idle;
+
done:
return target;
}
-/*
- * cpu_util returns the amount of capacity of a CPU that is used by CFS
- * tasks. The unit of the return value must be the one of capacity so we can
- * compare the utilization with the capacity of the CPU that is available for
- * CFS task (ie cpu_capacity).
- *
- * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
- * recent utilization of currently non-runnable tasks on a CPU. It represents
- * the amount of utilization of a CPU in the range [0..capacity_orig] where
- * capacity_orig is the cpu_capacity available at the highest frequency
- * (arch_scale_freq_capacity()).
- * The utilization of a CPU converges towards a sum equal to or less than the
- * current capacity (capacity_curr <= capacity_orig) of the CPU because it is
- * the running time on this CPU scaled by capacity_curr.
- *
- * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
- * higher than capacity_orig because of unfortunate rounding in
- * cfs.avg.util_avg or just after migrating tasks and new task wakeups until
- * the average stabilizes with the new running time. We need to check that the
- * utilization stays within the range of [0..capacity_orig] and cap it if
- * necessary. Without utilization capping, a group could be seen as overloaded
- * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
- * available capacity. We allow utilization to overshoot capacity_curr (but not
- * capacity_orig) as it useful for predicting the capacity required after task
- * migrations (scheduler-driven DVFS).
- */
-static int cpu_util(int cpu)
+static inline int find_best_target(struct task_struct *p, bool boosted, bool prefer_idle)
{
- unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
- unsigned long capacity = capacity_orig_of(cpu);
+ int iter_cpu;
+ int target_cpu = -1;
+ int target_util = 0;
+ int backup_capacity = 0;
+ int best_idle_cpu = -1;
+ int best_idle_cstate = INT_MAX;
+ int backup_cpu = -1;
+ unsigned long task_util_boosted, new_util;
+
+ task_util_boosted = boosted_task_util(p);
+ for (iter_cpu = 0; iter_cpu < NR_CPUS; iter_cpu++) {
+ int cur_capacity;
+ struct rq *rq;
+ int idle_idx;
+
+ /*
+ * Iterate from higher cpus for boosted tasks.
+ */
+ int i = boosted ? NR_CPUS-iter_cpu-1 : iter_cpu;
+
+ if (!cpu_online(i) || !cpumask_test_cpu(i, tsk_cpus_allowed(p)))
+ continue;
+
+ /*
+ * p's blocked utilization is still accounted for on prev_cpu
+ * so prev_cpu will receive a negative bias due to the double
+ * accounting. However, the blocked utilization may be zero.
+ */
+ new_util = cpu_util(i) + task_util_boosted;
+
+ /*
+ * Ensure minimum capacity to grant the required boost.
+ * The target CPU can be already at a capacity level higher
+ * than the one required to boost the task.
+ */
+ if (new_util > capacity_orig_of(i))
+ continue;
+
+ /*
+ * Unconditionally favoring tasks that prefer idle cpus to
+ * improve latency.
+ */
+ if (idle_cpu(i) && prefer_idle) {
+ if (best_idle_cpu < 0)
+ best_idle_cpu = i;
+ continue;
+ }
+
+ cur_capacity = capacity_curr_of(i);
+ rq = cpu_rq(i);
+ idle_idx = idle_get_state_idx(rq);
+
+ if (new_util < cur_capacity) {
+ if (cpu_rq(i)->nr_running) {
+ if(prefer_idle) {
+ // Find a target cpu with lowest
+ // utilization.
+ if (target_util == 0 ||
+ target_util < new_util) {
+ target_cpu = i;
+ target_util = new_util;
+ }
+ } else {
+ // Find a target cpu with highest
+ // utilization.
+ if (target_util == 0 ||
+ target_util > new_util) {
+ target_cpu = i;
+ target_util = new_util;
+ }
+ }
+ } else if (!prefer_idle) {
+ if (best_idle_cpu < 0 ||
+ (sysctl_sched_cstate_aware &&
+ best_idle_cstate > idle_idx)) {
+ best_idle_cstate = idle_idx;
+ best_idle_cpu = i;
+ }
+ }
+ } else if (backup_capacity == 0 ||
+ backup_capacity > cur_capacity) {
+ // Find a backup cpu with least capacity.
+ backup_capacity = cur_capacity;
+ backup_cpu = i;
+ }
+ }
+
+ if (prefer_idle && best_idle_cpu >= 0)
+ target_cpu = best_idle_cpu;
+ else if (target_cpu < 0)
+ target_cpu = best_idle_cpu >= 0 ? best_idle_cpu : backup_cpu;
+
+ return target_cpu;
+}
+
+static int energy_aware_wake_cpu(struct task_struct *p, int target, int sync)
+{
+ struct sched_domain *sd;
+ struct sched_group *sg, *sg_target;
+ int target_max_cap = INT_MAX;
+ int target_cpu = task_cpu(p);
+ unsigned long task_util_boosted, new_util;
+ int i;
+
+ if (sysctl_sched_sync_hint_enable && sync) {
+ int cpu = smp_processor_id();
+ cpumask_t search_cpus;
+ cpumask_and(&search_cpus, tsk_cpus_allowed(p), cpu_online_mask);
+ if (cpumask_test_cpu(cpu, &search_cpus))
+ return cpu;
+ }
+
+ sd = rcu_dereference(per_cpu(sd_ea, task_cpu(p)));
+
+ if (!sd)
+ return target;
+
+ sg = sd->groups;
+ sg_target = sg;
+
+ if (sysctl_sched_is_big_little) {
+
+ /*
+ * Find group with sufficient capacity. We only get here if no cpu is
+ * overutilized. We may end up overutilizing a cpu by adding the task,
+ * but that should not be any worse than select_idle_sibling().
+ * load_balance() should sort it out later as we get above the tipping
+ * point.
+ */
+ do {
+ /* Assuming all cpus are the same in group */
+ int max_cap_cpu = group_first_cpu(sg);
+
+ /*
+ * Assume smaller max capacity means more energy-efficient.
+ * Ideally we should query the energy model for the right
+ * answer but it easily ends up in an exhaustive search.
+ */
+ if (capacity_of(max_cap_cpu) < target_max_cap &&
+ task_fits_max(p, max_cap_cpu)) {
+ sg_target = sg;
+ target_max_cap = capacity_of(max_cap_cpu);
+ }
+ } while (sg = sg->next, sg != sd->groups);
+
+ task_util_boosted = boosted_task_util(p);
+ /* Find cpu with sufficient capacity */
+ for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg_target)) {
+ /*
+ * p's blocked utilization is still accounted for on prev_cpu
+ * so prev_cpu will receive a negative bias due to the double
+ * accounting. However, the blocked utilization may be zero.
+ */
+ new_util = cpu_util(i) + task_util_boosted;
+
+ /*
+ * Ensure minimum capacity to grant the required boost.
+ * The target CPU can be already at a capacity level higher
+ * than the one required to boost the task.
+ */
+ if (new_util > capacity_orig_of(i))
+ continue;
+
+ if (new_util < capacity_curr_of(i)) {
+ target_cpu = i;
+ if (cpu_rq(i)->nr_running)
+ break;
+ }
+
+ /* cpu has capacity at higher OPP, keep it as fallback */
+ if (target_cpu == task_cpu(p))
+ target_cpu = i;
+ }
+ } else {
+ /*
+ * Find a cpu with sufficient capacity
+ */
+#ifdef CONFIG_CGROUP_SCHEDTUNE
+ bool boosted = schedtune_task_boost(p) > 0;
+ bool prefer_idle = schedtune_prefer_idle(p) > 0;
+#else
+ bool boosted = 0;
+ bool prefer_idle = 0;
+#endif
+ int tmp_target = find_best_target(p, boosted, prefer_idle);
+ if (tmp_target >= 0) {
+ target_cpu = tmp_target;
+ if ((boosted || prefer_idle) && idle_cpu(target_cpu))
+ return target_cpu;
+ }
+ }
+
+ if (target_cpu != task_cpu(p)) {
+ struct energy_env eenv = {
+ .util_delta = task_util(p),
+ .src_cpu = task_cpu(p),
+ .dst_cpu = target_cpu,
+ .task = p,
+ };
+
+ /* Not enough spare capacity on previous cpu */
+ if (cpu_overutilized(task_cpu(p)))
+ return target_cpu;
+
+ if (energy_diff(&eenv) >= 0)
+ return task_cpu(p);
+ }
- return (util >= capacity) ? capacity : util;
+ return target_cpu;
}
/*
@@ -6109,7 +7025,9 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
return select_best_cpu(p, prev_cpu, 0, sync);
if (sd_flag & SD_BALANCE_WAKE)
- want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
+ want_affine = (!wake_wide(p) && task_fits_max(p, cpu) &&
+ cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) ||
+ energy_aware();
rcu_read_lock();
for_each_domain(cpu, tmp) {
@@ -6139,7 +7057,9 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
}
if (!sd) {
- if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
+ if (energy_aware() && !cpu_rq(cpu)->rd->overutilized)
+ new_cpu = energy_aware_wake_cpu(p, prev_cpu, sync);
+ else if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
new_cpu = select_idle_sibling(p, new_cpu);
} else while (sd) {
@@ -6209,6 +7129,8 @@ static void task_dead_fair(struct task_struct *p)
{
remove_entity_load_avg(&p->se);
}
+#else
+#define task_fits_max(p, cpu) true
#endif /* CONFIG_SMP */
static unsigned long
@@ -6455,6 +7377,8 @@ again:
if (hrtick_enabled(rq))
hrtick_start_fair(rq, p);
+ rq->misfit_task = !task_fits_max(p, rq->cpu);
+
return p;
simple:
cfs_rq = &rq->cfs;
@@ -6476,9 +7400,12 @@ simple:
if (hrtick_enabled(rq))
hrtick_start_fair(rq, p);
+ rq->misfit_task = !task_fits_max(p, rq->cpu);
+
return p;
idle:
+ rq->misfit_task = 0;
/*
* This is OK, because current is on_cpu, which avoids it being picked
* for load-balance and preemption/IRQs are still disabled avoiding
@@ -6691,6 +7618,13 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10;
enum fbq_type { regular, remote, all };
+enum group_type {
+ group_other = 0,
+ group_misfit_task,
+ group_imbalanced,
+ group_overloaded,
+};
+
#define LBF_ALL_PINNED 0x01
#define LBF_NEED_BREAK 0x02
#define LBF_DST_PINNED 0x04
@@ -6713,6 +7647,7 @@ struct lb_env {
int new_dst_cpu;
enum cpu_idle_type idle;
long imbalance;
+ unsigned int src_grp_nr_running;
/* The set of CPUs under consideration for load-balancing */
struct cpumask *cpus;
unsigned int busiest_grp_capacity;
@@ -6725,6 +7660,7 @@ struct lb_env {
unsigned int loop_max;
enum fbq_type fbq_type;
+ enum group_type busiest_group_type;
struct list_head tasks;
enum sched_boost_policy boost_policy;
};
@@ -7108,6 +8044,10 @@ static void attach_one_task(struct rq *rq, struct task_struct *p)
{
raw_spin_lock(&rq->lock);
attach_task(rq, p);
+ /*
+ * We want to potentially raise target_cpu's OPP.
+ */
+ update_capacity_of(cpu_of(rq));
raw_spin_unlock(&rq->lock);
}
@@ -7129,6 +8069,11 @@ static void attach_tasks(struct lb_env *env)
attach_task(env->dst_rq, p);
}
+ /*
+ * We want to potentially raise env.dst_cpu's OPP.
+ */
+ update_capacity_of(env->dst_cpu);
+
raw_spin_unlock(&env->dst_rq->lock);
}
@@ -7224,12 +8169,6 @@ static unsigned long task_h_load(struct task_struct *p)
/********** Helpers for find_busiest_group ************************/
-enum group_type {
- group_other = 0,
- group_imbalanced,
- group_overloaded,
-};
-
/*
* sg_lb_stats - stats of a sched_group required for load_balancing
*/
@@ -7249,6 +8188,7 @@ struct sg_lb_stats {
unsigned int group_weight;
enum group_type group_type;
int group_no_capacity;
+ int group_misfit_task; /* A cpu has a task too big for its capacity */
#ifdef CONFIG_NUMA_BALANCING
unsigned int nr_numa_running;
unsigned int nr_preferred_running;
@@ -7394,19 +8334,57 @@ static unsigned long scale_rt_capacity(int cpu)
used = div_u64(avg, total);
+ /*
+ * deadline bandwidth is defined at system level so we must
+ * weight this bandwidth with the max capacity of the system.
+ * As a reminder, avg_bw is 20bits width and
+ * scale_cpu_capacity is 10 bits width
+ */
+ used += div_u64(rq->dl.avg_bw, arch_scale_cpu_capacity(NULL, cpu));
+
if (likely(used < SCHED_CAPACITY_SCALE))
return SCHED_CAPACITY_SCALE - used;
return 1;
}
+void init_max_cpu_capacity(struct max_cpu_capacity *mcc)
+{
+ raw_spin_lock_init(&mcc->lock);
+ mcc->val = 0;
+ mcc->cpu = -1;
+}
+
static void update_cpu_capacity(struct sched_domain *sd, int cpu)
{
unsigned long capacity = arch_scale_cpu_capacity(sd, cpu);
struct sched_group *sdg = sd->groups;
+ struct max_cpu_capacity *mcc;
+ unsigned long max_capacity;
+ int max_cap_cpu;
+ unsigned long flags;
cpu_rq(cpu)->cpu_capacity_orig = capacity;
+ mcc = &cpu_rq(cpu)->rd->max_cpu_capacity;
+
+ raw_spin_lock_irqsave(&mcc->lock, flags);
+ max_capacity = mcc->val;
+ max_cap_cpu = mcc->cpu;
+
+ if ((max_capacity > capacity && max_cap_cpu == cpu) ||
+ (max_capacity < capacity)) {
+ mcc->val = capacity;
+ mcc->cpu = cpu;
+#ifdef CONFIG_SCHED_DEBUG
+ raw_spin_unlock_irqrestore(&mcc->lock, flags);
+ pr_info("CPU%d: update max cpu_capacity %lu\n", cpu, capacity);
+ goto skip_unlock;
+#endif
+ }
+ raw_spin_unlock_irqrestore(&mcc->lock, flags);
+
+skip_unlock: __attribute__ ((unused));
capacity *= scale_rt_capacity(cpu);
capacity >>= SCHED_CAPACITY_SHIFT;
@@ -7415,13 +8393,14 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
cpu_rq(cpu)->cpu_capacity = capacity;
sdg->sgc->capacity = capacity;
+ sdg->sgc->max_capacity = capacity;
}
void update_group_capacity(struct sched_domain *sd, int cpu)
{
struct sched_domain *child = sd->child;
struct sched_group *group, *sdg = sd->groups;
- unsigned long capacity;
+ unsigned long capacity, max_capacity;
unsigned long interval;
interval = msecs_to_jiffies(sd->balance_interval);
@@ -7434,6 +8413,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
}
capacity = 0;
+ max_capacity = 0;
if (child->flags & SD_OVERLAP) {
/*
@@ -7460,11 +8440,12 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
*/
if (unlikely(!rq->sd)) {
capacity += capacity_of(cpu);
- continue;
+ } else {
+ sgc = rq->sd->groups->sgc;
+ capacity += sgc->capacity;
}
- sgc = rq->sd->groups->sgc;
- capacity += sgc->capacity;
+ max_capacity = max(capacity, max_capacity);
}
} else {
/*
@@ -7474,16 +8455,21 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
group = child->groups;
do {
+ struct sched_group_capacity *sgc = group->sgc;
+
cpumask_t *cpus = sched_group_cpus(group);
/* Revisit this later. This won't work for MT domain */
- if (!cpu_isolated(cpumask_first(cpus)))
- capacity += group->sgc->capacity;
+ if (!cpu_isolated(cpumask_first(cpus))) {
+ capacity += sgc->capacity;
+ max_capacity = max(sgc->max_capacity, max_capacity);
+ }
group = group->next;
} while (group != child->groups);
}
sdg->sgc->capacity = capacity;
+ sdg->sgc->max_capacity = max_capacity;
}
/*
@@ -7578,6 +8564,18 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
return false;
}
+
+/*
+ * group_smaller_cpu_capacity: Returns true if sched_group sg has smaller
+ * per-cpu capacity than sched_group ref.
+ */
+static inline bool
+group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
+{
+ return sg->sgc->max_capacity + capacity_margin - SCHED_LOAD_SCALE <
+ ref->sgc->max_capacity;
+}
+
static inline enum
group_type group_classify(struct sched_group *group,
struct sg_lb_stats *sgs, struct lb_env *env)
@@ -7588,6 +8586,9 @@ group_type group_classify(struct sched_group *group,
if (sg_imbalanced(group))
return group_imbalanced;
+ if (sgs->group_misfit_task)
+ return group_misfit_task;
+
return group_other;
}
@@ -7599,14 +8600,15 @@ group_type group_classify(struct sched_group *group,
* @local_group: Does group contain this_cpu.
* @sgs: variable to hold the statistics for this group.
* @overload: Indicate more than one runnable task for any CPU.
+ * @overutilized: Indicate overutilization for any CPU.
*/
static inline void update_sg_lb_stats(struct lb_env *env,
struct sched_group *group, int load_idx,
int local_group, struct sg_lb_stats *sgs,
- bool *overload)
+ bool *overload, bool *overutilized)
{
unsigned long load;
- int i;
+ int i, nr_running;
memset(sgs, 0, sizeof(*sgs));
@@ -7631,7 +8633,8 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->group_util += cpu_util(i);
sgs->sum_nr_running += rq->cfs.h_nr_running;
- if (rq->nr_running > 1)
+ nr_running = rq->nr_running;
+ if (nr_running > 1)
*overload = true;
#ifdef CONFIG_SCHED_HMP
@@ -7644,8 +8647,17 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->nr_preferred_running += rq->nr_preferred_running;
#endif
sgs->sum_weighted_load += weighted_cpuload(i);
- if (idle_cpu(i))
+ /*
+ * No need to call idle_cpu() if nr_running is not 0
+ */
+ if (!nr_running && idle_cpu(i))
sgs->idle_cpus++;
+
+ if (energy_aware() && cpu_overutilized(i)) {
+ *overutilized = true;
+ if (!sgs->group_misfit_task && rq->misfit_task)
+ sgs->group_misfit_task = capacity_of(i);
+ }
}
/* Isolated CPU has no weight */
@@ -7727,8 +8739,26 @@ static bool update_sd_pick_busiest(struct lb_env *env,
if (sgs->group_type < busiest->group_type)
return false;
- if (sgs->avg_load <= busiest->avg_load)
- return false;
+ if (energy_aware()) {
+ /*
+ * Candidate sg doesn't face any serious load-balance problems
+ * so don't pick it if the local sg is already filled up.
+ */
+ if (sgs->group_type == group_other &&
+ !group_has_capacity(env, &sds->local_stat))
+ return false;
+
+ if (sgs->avg_load <= busiest->avg_load)
+ return false;
+
+ /*
+ * Candiate sg has no more than one task per cpu and has higher
+ * per-cpu capacity. No reason to pull tasks to less capable cpus.
+ */
+ if (sgs->sum_nr_running <= sgs->group_weight &&
+ group_smaller_cpu_capacity(sds->local, sg))
+ return false;
+ }
/* This is the busiest node in its class. */
if (!(env->sd->flags & SD_ASYM_PACKING))
@@ -7791,7 +8821,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
struct sched_group *sg = env->sd->groups;
struct sg_lb_stats tmp_sgs;
int load_idx, prefer_sibling = 0;
- bool overload = false;
+ bool overload = false, overutilized = false;
if (child && child->flags & SD_PREFER_SIBLING)
prefer_sibling = 1;
@@ -7813,7 +8843,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
}
update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
- &overload);
+ &overload, &overutilized);
if (local_group)
goto next_group;
@@ -7835,6 +8865,16 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
sgs->group_type = group_classify(sg, sgs, env);
}
+ /*
+ * Ignore task groups with misfit tasks if local group has no
+ * capacity or if per-cpu capacity isn't higher.
+ */
+ if (energy_aware() &&
+ sgs->group_type == group_misfit_task &&
+ (!group_has_capacity(env, &sds->local_stat) ||
+ !group_smaller_cpu_capacity(sg, sds->local)))
+ sgs->group_type = group_other;
+
if (update_sd_pick_busiest(env, sds, sg, sgs)) {
sds->busiest = sg;
sds->busiest_stat = *sgs;
@@ -7853,10 +8893,23 @@ next_group:
if (env->sd->flags & SD_NUMA)
env->fbq_type = fbq_classify_group(&sds->busiest_stat);
+ env->src_grp_nr_running = sds->busiest_stat.sum_nr_running;
+
if (!env->sd->parent) {
/* update overload indicator if we are at root domain */
if (env->dst_rq->rd->overload != overload)
env->dst_rq->rd->overload = overload;
+
+ /* Update over-utilization (tipping point, U >= 0) indicator */
+ if (energy_aware() && env->dst_rq->rd->overutilized != overutilized) {
+ env->dst_rq->rd->overutilized = overutilized;
+ trace_sched_overutilized(overutilized);
+ }
+ } else {
+ if (energy_aware() && !env->dst_rq->rd->overutilized && overutilized) {
+ env->dst_rq->rd->overutilized = true;
+ trace_sched_overutilized(true);
+ }
}
}
@@ -8005,6 +9058,24 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
*/
if (busiest->avg_load <= sds->avg_load ||
local->avg_load >= sds->avg_load) {
+ if (energy_aware()) {
+ /* Misfitting tasks should be migrated in any case */
+ if (busiest->group_type == group_misfit_task) {
+ env->imbalance = busiest->group_misfit_task;
+ return;
+ }
+
+ /*
+ * Busiest group is overloaded, local is not, use the spare
+ * cycles to maximize throughput
+ */
+ if (busiest->group_type == group_overloaded &&
+ local->group_type <= group_misfit_task) {
+ env->imbalance = busiest->load_per_task;
+ return;
+ }
+ }
+
env->imbalance = 0;
return fix_small_imbalance(env, sds);
}
@@ -8038,6 +9109,11 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
(sds->avg_load - local->avg_load) * local->group_capacity
) / SCHED_CAPACITY_SCALE;
+ /* Boost imbalance to allow misfit task to be balanced. */
+ if (energy_aware() && busiest->group_type == group_misfit_task)
+ env->imbalance = max_t(long, env->imbalance,
+ busiest->group_misfit_task);
+
/*
* if *imbalance is less than the average load per runnable task
* there is no guarantee that any tasks will be moved so we'll have
@@ -8079,6 +9155,10 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
* this level.
*/
update_sd_lb_stats(env, &sds);
+
+ if (energy_aware() && !env->dst_rq->rd->overutilized)
+ goto out_balanced;
+
local = &sds.local_stat;
busiest = &sds.busiest_stat;
@@ -8113,6 +9193,11 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
busiest->group_no_capacity)
goto force_balance;
+ /* Misfitting tasks should be dealt with regardless of the avg load */
+ if (energy_aware() && busiest->group_type == group_misfit_task) {
+ goto force_balance;
+ }
+
/*
* If the local group is busier than the selected busiest group
* don't try and pull any tasks.
@@ -8136,7 +9221,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
* might end up to just move the imbalance on another group
*/
if ((busiest->group_type != group_overloaded) &&
- (local->idle_cpus <= (busiest->idle_cpus + 1)))
+ (local->idle_cpus <= (busiest->idle_cpus + 1)) &&
+ !group_smaller_cpu_capacity(sds.busiest, sds.local))
goto out_balanced;
} else {
/*
@@ -8149,6 +9235,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
}
force_balance:
+ env->busiest_group_type = busiest->group_type;
/* Looks like there is an imbalance. Compute it */
calculate_imbalance(env, &sds);
return sds.busiest;
@@ -8264,7 +9351,8 @@ static struct rq *find_busiest_queue(struct lb_env *env,
*/
if (rq->nr_running == 1 && wl > env->imbalance &&
- !check_cpu_capacity(rq, env->sd))
+ !check_cpu_capacity(rq, env->sd) &&
+ env->busiest_group_type != group_misfit_task)
continue;
/*
@@ -8330,6 +9418,14 @@ static int need_active_balance(struct lb_env *env)
return 1;
}
+ if (energy_aware() &&
+ (capacity_of(env->src_cpu) < capacity_of(env->dst_cpu)) &&
+ env->src_rq->cfs.h_nr_running == 1 &&
+ cpu_overutilized(env->src_cpu) &&
+ !cpu_overutilized(env->dst_cpu)) {
+ return 1;
+ }
+
return unlikely(sd->nr_balance_failed >
sd->cache_nice_tries + NEED_ACTIVE_BALANCE_THRESHOLD);
}
@@ -8473,6 +9569,11 @@ more_balance:
* ld_moved - cumulative load moved across iterations
*/
cur_ld_moved = detach_tasks(&env);
+ /*
+ * We want to potentially lower env.src_cpu's OPP.
+ */
+ if (cur_ld_moved)
+ update_capacity_of(env.src_cpu);
/*
* We've detached some tasks from busiest_rq. Every
@@ -8567,8 +9668,10 @@ no_move:
* excessive cache_hot migrations and active balances.
*/
if (idle != CPU_NEWLY_IDLE &&
- !(env.flags & LBF_BIG_TASK_ACTIVE_BALANCE))
- sd->nr_balance_failed++;
+ !(env.flags & LBF_BIG_TASK_ACTIVE_BALANCE)) {
+ if (env.src_grp_nr_running > 1)
+ sd->nr_balance_failed++;
+ }
if (need_active_balance(&env)) {
raw_spin_lock_irqsave(&busiest->lock, flags);
@@ -8720,6 +9823,7 @@ static int idle_balance(struct rq *this_rq)
struct sched_domain *sd;
int pulled_task = 0;
u64 curr_cost = 0;
+ long removed_util=0;
if (cpu_isolated(this_cpu))
return 0;
@@ -8732,8 +9836,9 @@ static int idle_balance(struct rq *this_rq)
*/
this_rq->idle_stamp = rq_clock(this_rq);
- if (this_rq->avg_idle < sysctl_sched_migration_cost ||
- !this_rq->rd->overload) {
+ if (!energy_aware() &&
+ (this_rq->avg_idle < sysctl_sched_migration_cost ||
+ !this_rq->rd->overload)) {
rcu_read_lock();
sd = rcu_dereference_check_sched_domain(this_rq->sd);
if (sd)
@@ -8745,6 +9850,17 @@ static int idle_balance(struct rq *this_rq)
raw_spin_unlock(&this_rq->lock);
+ /*
+ * If removed_util_avg is !0 we most probably migrated some task away
+ * from this_cpu. In this case we might be willing to trigger an OPP
+ * update, but we want to do so if we don't find anybody else to pull
+ * here (we will trigger an OPP update with the pulled task's enqueue
+ * anyway).
+ *
+ * Record removed_util before calling update_blocked_averages, and use
+ * it below (before returning) to see if an OPP update is required.
+ */
+ removed_util = atomic_long_read(&(this_rq->cfs).removed_util_avg);
update_blocked_averages(this_cpu);
rcu_read_lock();
for_each_domain(this_cpu, sd) {
@@ -8812,6 +9928,12 @@ out:
if (pulled_task) {
idle_exit_fair(this_rq);
this_rq->idle_stamp = 0;
+ } else if (removed_util) {
+ /*
+ * No task pulled and someone has been migrated away.
+ * Good case to trigger an OPP update.
+ */
+ update_capacity_of(this_cpu);
}
return pulled_task;
@@ -8895,6 +10017,10 @@ static int active_load_balance_cpu_stop(void *data)
p = detach_one_task(&env);
if (p) {
schedstat_inc(sd, alb_pushed);
+ /*
+ * We want to potentially lower env.src_cpu's OPP.
+ */
+ update_capacity_of(env.src_cpu);
moved = true;
} else {
schedstat_inc(sd, alb_failed);
@@ -9372,6 +10498,10 @@ static inline int _nohz_kick_needed(struct rq *rq, int cpu, int *type)
if (time_before(now, nohz.next_balance))
return 0;
+ if (rq->nr_running >= 2 &&
+ (!energy_aware() || cpu_overutilized(cpu)))
+ return true;
+
return (rq->nr_running >= 2);
}
@@ -9412,7 +10542,7 @@ static inline bool nohz_kick_needed(struct rq *rq, int *type)
#ifndef CONFIG_SCHED_HMP
rcu_read_lock();
sd = rcu_dereference(per_cpu(sd_busy, cpu));
- if (sd) {
+ if (sd && !energy_aware()) {
sgc = sd->groups->sgc;
nr_busy = atomic_read(&sgc->nr_busy_cpus);
@@ -9523,6 +10653,17 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
if (static_branch_unlikely(&sched_numa_balancing))
task_tick_numa(rq, curr);
+
+#ifdef CONFIG_SMP
+ if (energy_aware() &&
+ !rq->rd->overutilized && cpu_overutilized(task_cpu(curr))) {
+ rq->rd->overutilized = true;
+ trace_sched_overutilized(true);
+ }
+
+ rq->misfit_task = !task_fits_max(curr, rq->cpu);
+#endif
+
}
/*
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index acee1854c3d0..7cc74e56fde4 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -68,3 +68,9 @@ SCHED_FEAT(FORCE_SD_OVERLAP, false)
SCHED_FEAT(RT_RUNTIME_SHARE, true)
SCHED_FEAT(LB_MIN, false)
SCHED_FEAT(ATTACH_AGE_LOAD, true)
+
+/*
+ * Energy aware scheduling. Use platform energy model to guide scheduling
+ * decisions optimizing for energy efficiency.
+ */
+SCHED_FEAT(ENERGY_AWARE, false)
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 2489140a7c51..917c94abf5bb 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -19,9 +19,10 @@
* sched_idle_set_state - Record idle state for the current CPU.
* @idle_state: State to record.
*/
-void sched_idle_set_state(struct cpuidle_state *idle_state)
+void sched_idle_set_state(struct cpuidle_state *idle_state, int index)
{
idle_set_state(this_rq(), idle_state);
+ idle_set_state_idx(this_rq(), index);
}
static int __read_mostly cpu_idle_force_poll;
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 52edd6b158ed..f8bc34c31c42 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1536,6 +1536,41 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flag
#endif
}
+#if defined(CONFIG_SMP) && defined(CONFIG_CPU_FREQ_GOV_SCHED)
+static void sched_rt_update_capacity_req(struct rq *rq)
+{
+ u64 total, used, age_stamp, avg;
+ s64 delta;
+
+ if (!sched_freq())
+ return;
+
+ sched_avg_update(rq);
+ /*
+ * Since we're reading these variables without serialization make sure
+ * we read them once before doing sanity checks on them.
+ */
+ age_stamp = READ_ONCE(rq->age_stamp);
+ avg = READ_ONCE(rq->rt_avg);
+ delta = rq_clock(rq) - age_stamp;
+
+ if (unlikely(delta < 0))
+ delta = 0;
+
+ total = sched_avg_period() + delta;
+
+ used = div_u64(avg, total);
+ if (unlikely(used > SCHED_CAPACITY_SCALE))
+ used = SCHED_CAPACITY_SCALE;
+
+ set_rt_cpu_capacity(rq->cpu, 1, (unsigned long)(used));
+}
+#else
+static inline void sched_rt_update_capacity_req(struct rq *rq)
+{ }
+
+#endif
+
static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
struct rt_rq *rt_rq)
{
@@ -1604,8 +1639,17 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev)
if (prev->sched_class == &rt_sched_class)
update_curr_rt(rq);
- if (!rt_rq->rt_queued)
+ if (!rt_rq->rt_queued) {
+ /*
+ * The next task to be picked on this rq will have a lower
+ * priority than rt tasks so we can spend some time to update
+ * the capacity used by rt tasks based on the last activity.
+ * This value will be the used as an estimation of the next
+ * activity.
+ */
+ sched_rt_update_capacity_req(rq);
return NULL;
+ }
put_prev_task(rq, prev);
@@ -2409,6 +2453,9 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
update_curr_rt(rq);
+ if (rq->rt.rt_nr_running)
+ sched_rt_update_capacity_req(rq);
+
watchdog(rq, p);
/*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index f569c6fe3cbb..f3f10c4b7a22 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -596,10 +596,18 @@ struct dl_rq {
#else
struct dl_bw dl_bw;
#endif
+ /* This is the "average utilization" for this runqueue */
+ s64 avg_bw;
};
#ifdef CONFIG_SMP
+struct max_cpu_capacity {
+ raw_spinlock_t lock;
+ unsigned long val;
+ int cpu;
+};
+
/*
* We add the notion of a root-domain which will be used to define per-domain
* variables. Each exclusive cpuset essentially defines an island domain by
@@ -618,6 +626,9 @@ struct root_domain {
/* Indicate more than one runnable task for any CPU */
bool overload;
+ /* Indicate one or more cpus over-utilized (tipping point) */
+ bool overutilized;
+
/*
* The bit corresponding to a CPU gets set here if such CPU has more
* than one runnable -deadline task (as it is below for RT tasks).
@@ -633,6 +644,9 @@ struct root_domain {
*/
cpumask_var_t rto_mask;
struct cpupri cpupri;
+
+ /* Maximum cpu capacity in the system. */
+ struct max_cpu_capacity max_cpu_capacity;
};
extern struct root_domain def_root_domain;
@@ -662,6 +676,7 @@ struct rq {
#define CPU_LOAD_IDX_MAX 5
unsigned long cpu_load[CPU_LOAD_IDX_MAX];
unsigned long last_load_update_tick;
+ unsigned int misfit_task;
#ifdef CONFIG_NO_HZ_COMMON
u64 nohz_stamp;
unsigned long nohz_flags;
@@ -669,6 +684,14 @@ struct rq {
#ifdef CONFIG_NO_HZ_FULL
unsigned long last_sched_tick;
#endif
+
+#ifdef CONFIG_CPU_QUIET
+ /* time-based average load */
+ u64 nr_last_stamp;
+ u64 nr_running_integral;
+ seqcount_t ave_seqcnt;
+#endif
+
/* capture load from *all* tasks on this cpu: */
struct load_weight load;
unsigned long nr_load_updates;
@@ -808,6 +831,7 @@ struct rq {
#ifdef CONFIG_CPU_IDLE
/* Must be inspected within a rcu lock section */
struct cpuidle_state *idle_state;
+ int idle_state_idx;
#endif
};
@@ -957,6 +981,8 @@ DECLARE_PER_CPU(int, sd_llc_id);
DECLARE_PER_CPU(struct sched_domain *, sd_numa);
DECLARE_PER_CPU(struct sched_domain *, sd_busy);
DECLARE_PER_CPU(struct sched_domain *, sd_asym);
+DECLARE_PER_CPU(struct sched_domain *, sd_ea);
+DECLARE_PER_CPU(struct sched_domain *, sd_scs);
struct sched_group_capacity {
atomic_t ref;
@@ -964,7 +990,8 @@ struct sched_group_capacity {
* CPU capacity of this group, SCHED_LOAD_SCALE being max capacity
* for a single CPU.
*/
- unsigned int capacity;
+ unsigned long capacity;
+ unsigned long max_capacity; /* Max per-cpu capacity in group */
unsigned long next_update;
int imbalance; /* XXX unrelated to capacity but shared group state */
/*
@@ -981,6 +1008,7 @@ struct sched_group {
unsigned int group_weight;
struct sched_group_capacity *sgc;
+ const struct sched_group_energy const *sge;
/*
* The CPUs this group covers.
@@ -1913,6 +1941,7 @@ static const u32 prio_to_wmult[40] = {
#endif
#define ENQUEUE_REPLENISH 0x08
#define ENQUEUE_RESTORE 0x10
+#define ENQUEUE_WAKEUP_NEW 0x20
#define DEQUEUE_SLEEP 0x01
#define DEQUEUE_SAVE 0x02
@@ -2004,6 +2033,7 @@ extern const struct sched_class idle_sched_class;
#ifdef CONFIG_SMP
+extern void init_max_cpu_capacity(struct max_cpu_capacity *mcc);
extern void update_group_capacity(struct sched_domain *sd, int cpu);
extern void trigger_load_balance(struct rq *rq);
@@ -2033,6 +2063,17 @@ static inline struct cpuidle_state *idle_get_state(struct rq *rq)
WARN_ON(!rcu_read_lock_held());
return rq->idle_state;
}
+
+static inline void idle_set_state_idx(struct rq *rq, int idle_state_idx)
+{
+ rq->idle_state_idx = idle_state_idx;
+}
+
+static inline int idle_get_state_idx(struct rq *rq)
+{
+ WARN_ON(!rcu_read_lock_held());
+ return rq->idle_state_idx;
+}
#else
static inline void idle_set_state(struct rq *rq,
struct cpuidle_state *idle_state)
@@ -2043,6 +2084,15 @@ static inline struct cpuidle_state *idle_get_state(struct rq *rq)
{
return NULL;
}
+
+static inline void idle_set_state_idx(struct rq *rq, int idle_state_idx)
+{
+}
+
+static inline int idle_get_state_idx(struct rq *rq)
+{
+ return -1;
+}
#endif
#ifdef CONFIG_SYSRQ_SCHED_DEBUG
@@ -2069,7 +2119,7 @@ unsigned long to_ratio(u64 period, u64 runtime);
extern void init_entity_runnable_average(struct sched_entity *se);
-static inline void add_nr_running(struct rq *rq, unsigned count)
+static inline void __add_nr_running(struct rq *rq, unsigned count)
{
unsigned prev_nr = rq->nr_running;
@@ -2098,12 +2148,49 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
}
}
-static inline void sub_nr_running(struct rq *rq, unsigned count)
+static inline void __sub_nr_running(struct rq *rq, unsigned count)
{
sched_update_nr_prod(cpu_of(rq), count, false);
rq->nr_running -= count;
}
+#ifdef CONFIG_CPU_QUIET
+#define NR_AVE_SCALE(x) ((x) << FSHIFT)
+static inline u64 do_nr_running_integral(struct rq *rq)
+{
+ s64 nr, deltax;
+ u64 nr_running_integral = rq->nr_running_integral;
+
+ deltax = rq->clock_task - rq->nr_last_stamp;
+ nr = NR_AVE_SCALE(rq->nr_running);
+
+ nr_running_integral += nr * deltax;
+
+ return nr_running_integral;
+}
+
+static inline void add_nr_running(struct rq *rq, unsigned count)
+{
+ write_seqcount_begin(&rq->ave_seqcnt);
+ rq->nr_running_integral = do_nr_running_integral(rq);
+ rq->nr_last_stamp = rq->clock_task;
+ __add_nr_running(rq, count);
+ write_seqcount_end(&rq->ave_seqcnt);
+}
+
+static inline void sub_nr_running(struct rq *rq, unsigned count)
+{
+ write_seqcount_begin(&rq->ave_seqcnt);
+ rq->nr_running_integral = do_nr_running_integral(rq);
+ rq->nr_last_stamp = rq->clock_task;
+ __sub_nr_running(rq, count);
+ write_seqcount_end(&rq->ave_seqcnt);
+}
+#else
+#define add_nr_running __add_nr_running
+#define sub_nr_running __sub_nr_running
+#endif
+
static inline void rq_last_tick_reset(struct rq *rq)
{
#ifdef CONFIG_NO_HZ_FULL
@@ -2176,10 +2263,137 @@ unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
}
#endif
+#ifdef CONFIG_SMP
+static inline unsigned long capacity_of(int cpu)
+{
+ return cpu_rq(cpu)->cpu_capacity;
+}
+
+static inline unsigned long capacity_orig_of(int cpu)
+{
+ return cpu_rq(cpu)->cpu_capacity_orig;
+}
+
+extern unsigned int sysctl_sched_use_walt_cpu_util;
+extern unsigned int walt_ravg_window;
+extern unsigned int walt_disabled;
+
+/*
+ * cpu_util returns the amount of capacity of a CPU that is used by CFS
+ * tasks. The unit of the return value must be the one of capacity so we can
+ * compare the utilization with the capacity of the CPU that is available for
+ * CFS task (ie cpu_capacity).
+ *
+ * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
+ * recent utilization of currently non-runnable tasks on a CPU. It represents
+ * the amount of utilization of a CPU in the range [0..capacity_orig] where
+ * capacity_orig is the cpu_capacity available at the highest frequency
+ * (arch_scale_freq_capacity()).
+ * The utilization of a CPU converges towards a sum equal to or less than the
+ * current capacity (capacity_curr <= capacity_orig) of the CPU because it is
+ * the running time on this CPU scaled by capacity_curr.
+ *
+ * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
+ * higher than capacity_orig because of unfortunate rounding in
+ * cfs.avg.util_avg or just after migrating tasks and new task wakeups until
+ * the average stabilizes with the new running time. We need to check that the
+ * utilization stays within the range of [0..capacity_orig] and cap it if
+ * necessary. Without utilization capping, a group could be seen as overloaded
+ * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
+ * available capacity. We allow utilization to overshoot capacity_curr (but not
+ * capacity_orig) as it useful for predicting the capacity required after task
+ * migrations (scheduler-driven DVFS).
+ */
+static inline unsigned long __cpu_util(int cpu, int delta)
+{
+ unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
+ unsigned long capacity = capacity_orig_of(cpu);
+
+ delta += util;
+ if (delta < 0)
+ return 0;
+
+ return (delta >= capacity) ? capacity : delta;
+}
+
+static inline unsigned long cpu_util(int cpu)
+{
+ return __cpu_util(cpu, 0);
+}
+
+#endif
+
+#ifdef CONFIG_CPU_FREQ_GOV_SCHED
+#define capacity_max SCHED_CAPACITY_SCALE
+extern unsigned int capacity_margin;
+extern struct static_key __sched_freq;
+
+static inline bool sched_freq(void)
+{
+ return static_key_false(&__sched_freq);
+}
+
+DECLARE_PER_CPU(struct sched_capacity_reqs, cpu_sched_capacity_reqs);
+void update_cpu_capacity_request(int cpu, bool request);
+
+static inline void set_cfs_cpu_capacity(int cpu, bool request,
+ unsigned long capacity)
+{
+ struct sched_capacity_reqs *scr = &per_cpu(cpu_sched_capacity_reqs, cpu);
+
+ if (scr->cfs != capacity) {
+ scr->cfs = capacity;
+ update_cpu_capacity_request(cpu, request);
+ }
+}
+
+static inline void set_rt_cpu_capacity(int cpu, bool request,
+ unsigned long capacity)
+{
+ if (per_cpu(cpu_sched_capacity_reqs, cpu).rt != capacity) {
+ per_cpu(cpu_sched_capacity_reqs, cpu).rt = capacity;
+ update_cpu_capacity_request(cpu, request);
+ }
+}
+
+static inline void set_dl_cpu_capacity(int cpu, bool request,
+ unsigned long capacity)
+{
+ if (per_cpu(cpu_sched_capacity_reqs, cpu).dl != capacity) {
+ per_cpu(cpu_sched_capacity_reqs, cpu).dl = capacity;
+ update_cpu_capacity_request(cpu, request);
+ }
+}
+#else
+#define sched_freq() false
+static inline void set_cfs_cpu_capacity(int cpu, bool request,
+ unsigned long capacity)
+{ }
+static inline void set_rt_cpu_capacity(int cpu, bool request,
+ unsigned long capacity)
+{ }
+static inline void set_dl_cpu_capacity(int cpu, bool request,
+ unsigned long capacity)
+{ }
+#endif
+
+#ifdef CONFIG_SCHED_HMP
+/*
+ * HMP and EAS are orthogonal. Hopefully the compiler just elides out all code
+ * with the energy_aware() check, so that we don't even pay the comparison
+ * penalty at runtime.
+ */
+#define energy_aware() false
+#else
+static inline bool energy_aware(void)
+{
+ return sched_feat(ENERGY_AWARE);
+}
+#endif
+
static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
{
rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq));
- sched_avg_update(rq);
}
#else
static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { }
@@ -2268,6 +2482,9 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
}
+extern struct rq *lock_rq_of(struct task_struct *p, unsigned long *flags);
+extern void unlock_rq_of(struct rq *rq, struct task_struct *p, unsigned long *flags);
+
#ifdef CONFIG_SMP
#ifdef CONFIG_PREEMPT
@@ -2340,7 +2557,8 @@ static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest)
static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
__releases(busiest->lock)
{
- raw_spin_unlock(&busiest->lock);
+ if (this_rq != busiest)
+ raw_spin_unlock(&busiest->lock);
lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
}
diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c
index ee2af8e0b5ce..3c2e21c5a5a0 100644
--- a/kernel/sched/tune.c
+++ b/kernel/sched/tune.c
@@ -1,13 +1,108 @@
#include <linux/cgroup.h>
#include <linux/err.h>
+#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/printk.h>
+#include <linux/rcupdate.h>
#include <linux/slab.h>
+#include <trace/events/sched.h>
+
#include "sched.h"
+#include "tune.h"
+
+#ifdef CONFIG_CGROUP_SCHEDTUNE
+static bool schedtune_initialized = false;
+#endif
unsigned int sysctl_sched_cfs_boost __read_mostly;
+extern struct target_nrg schedtune_target_nrg;
+
+/* Performance Boost region (B) threshold params */
+static int perf_boost_idx;
+
+/* Performance Constraint region (C) threshold params */
+static int perf_constrain_idx;
+
+/**
+ * Performance-Energy (P-E) Space thresholds constants
+ */
+struct threshold_params {
+ int nrg_gain;
+ int cap_gain;
+};
+
+/*
+ * System specific P-E space thresholds constants
+ */
+static struct threshold_params
+threshold_gains[] = {
+ { 0, 5 }, /* < 10% */
+ { 1, 5 }, /* < 20% */
+ { 2, 5 }, /* < 30% */
+ { 3, 5 }, /* < 40% */
+ { 4, 5 }, /* < 50% */
+ { 5, 4 }, /* < 60% */
+ { 5, 3 }, /* < 70% */
+ { 5, 2 }, /* < 80% */
+ { 5, 1 }, /* < 90% */
+ { 5, 0 } /* <= 100% */
+};
+
+static int
+__schedtune_accept_deltas(int nrg_delta, int cap_delta,
+ int perf_boost_idx, int perf_constrain_idx)
+{
+ int payoff = -INT_MAX;
+ int gain_idx = -1;
+
+ /* Performance Boost (B) region */
+ if (nrg_delta >= 0 && cap_delta > 0)
+ gain_idx = perf_boost_idx;
+ /* Performance Constraint (C) region */
+ else if (nrg_delta < 0 && cap_delta <= 0)
+ gain_idx = perf_constrain_idx;
+
+ /* Default: reject schedule candidate */
+ if (gain_idx == -1)
+ return payoff;
+
+ /*
+ * Evaluate "Performance Boost" vs "Energy Increase"
+ *
+ * - Performance Boost (B) region
+ *
+ * Condition: nrg_delta > 0 && cap_delta > 0
+ * Payoff criteria:
+ * cap_gain / nrg_gain < cap_delta / nrg_delta =
+ * cap_gain * nrg_delta < cap_delta * nrg_gain
+ * Note that since both nrg_gain and nrg_delta are positive, the
+ * inequality does not change. Thus:
+ *
+ * payoff = (cap_delta * nrg_gain) - (cap_gain * nrg_delta)
+ *
+ * - Performance Constraint (C) region
+ *
+ * Condition: nrg_delta < 0 && cap_delta < 0
+ * payoff criteria:
+ * cap_gain / nrg_gain > cap_delta / nrg_delta =
+ * cap_gain * nrg_delta < cap_delta * nrg_gain
+ * Note that since nrg_gain > 0 while nrg_delta < 0, the
+ * inequality change. Thus:
+ *
+ * payoff = (cap_delta * nrg_gain) - (cap_gain * nrg_delta)
+ *
+ * This means that, in case of same positive defined {cap,nrg}_gain
+ * for both the B and C regions, we can use the same payoff formula
+ * where a positive value represents the accept condition.
+ */
+ payoff = cap_delta * threshold_gains[gain_idx].nrg_gain;
+ payoff -= nrg_delta * threshold_gains[gain_idx].cap_gain;
+
+ return payoff;
+}
+
#ifdef CONFIG_CGROUP_SCHEDTUNE
/*
@@ -52,6 +147,15 @@ struct schedtune {
bool colocate_update_disabled;
#endif
+ /* Performance Boost (B) region threshold params */
+ int perf_boost_idx;
+
+ /* Performance Constraint (C) region threshold params */
+ int perf_constrain_idx;
+
+ /* Hint to bias scheduling of tasks on that SchedTune CGroup
+ * towards idle CPUs */
+ int prefer_idle;
};
static inline struct schedtune *css_st(struct cgroup_subsys_state *css)
@@ -88,8 +192,42 @@ root_schedtune = {
.colocate = false,
.colocate_update_disabled = false,
#endif
+ .perf_boost_idx = 0,
+ .perf_constrain_idx = 0,
+ .prefer_idle = 0,
};
+int
+schedtune_accept_deltas(int nrg_delta, int cap_delta,
+ struct task_struct *task)
+{
+ struct schedtune *ct;
+ int perf_boost_idx;
+ int perf_constrain_idx;
+
+ /* Optimal (O) region */
+ if (nrg_delta < 0 && cap_delta > 0) {
+ trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, 1, 0);
+ return INT_MAX;
+ }
+
+ /* Suboptimal (S) region */
+ if (nrg_delta > 0 && cap_delta < 0) {
+ trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, -1, 5);
+ return -INT_MAX;
+ }
+
+ /* Get task specific perf Boost/Constraints indexes */
+ rcu_read_lock();
+ ct = task_schedtune(task);
+ perf_boost_idx = ct->perf_boost_idx;
+ perf_constrain_idx = ct->perf_constrain_idx;
+ rcu_read_unlock();
+
+ return __schedtune_accept_deltas(nrg_delta, cap_delta,
+ perf_boost_idx, perf_constrain_idx);
+}
+
/*
* Maximum number of boost groups to support
* When per-task boosting is used we still allow only limited number of
@@ -119,13 +257,16 @@ static struct schedtune *allocated_group[BOOSTGROUPS_COUNT] = {
*/
struct boost_groups {
/* Maximum boost value for all RUNNABLE tasks on a CPU */
- unsigned boost_max;
+ bool idle;
+ int boost_max;
struct {
/* The boost for tasks on that boost group */
- unsigned boost;
+ int boost;
/* Count of RUNNABLE tasks on that boost group */
unsigned tasks;
} group[BOOSTGROUPS_COUNT];
+ /* CPU's boost group locking */
+ raw_spinlock_t lock;
};
/* Boost groups affecting each CPU in the system */
@@ -246,7 +387,342 @@ static inline void init_sched_boost(struct schedtune *st) { }
#endif /* CONFIG_SCHED_HMP */
+static void
+schedtune_cpu_update(int cpu)
+{
+ struct boost_groups *bg;
+ int boost_max;
+ int idx;
+
+ bg = &per_cpu(cpu_boost_groups, cpu);
+
+ /* The root boost group is always active */
+ boost_max = bg->group[0].boost;
+ for (idx = 1; idx < BOOSTGROUPS_COUNT; ++idx) {
+ /*
+ * A boost group affects a CPU only if it has
+ * RUNNABLE tasks on that CPU
+ */
+ if (bg->group[idx].tasks == 0)
+ continue;
+
+ boost_max = max(boost_max, bg->group[idx].boost);
+ }
+ /* Ensures boost_max is non-negative when all cgroup boost values
+ * are neagtive. Avoids under-accounting of cpu capacity which may cause
+ * task stacking and frequency spikes.*/
+ boost_max = max(boost_max, 0);
+ bg->boost_max = boost_max;
+}
+
+static int
+schedtune_boostgroup_update(int idx, int boost)
+{
+ struct boost_groups *bg;
+ int cur_boost_max;
+ int old_boost;
+ int cpu;
+
+ /* Update per CPU boost groups */
+ for_each_possible_cpu(cpu) {
+ bg = &per_cpu(cpu_boost_groups, cpu);
+
+ /*
+ * Keep track of current boost values to compute the per CPU
+ * maximum only when it has been affected by the new value of
+ * the updated boost group
+ */
+ cur_boost_max = bg->boost_max;
+ old_boost = bg->group[idx].boost;
+
+ /* Update the boost value of this boost group */
+ bg->group[idx].boost = boost;
+
+ /* Check if this update increase current max */
+ if (boost > cur_boost_max && bg->group[idx].tasks) {
+ bg->boost_max = boost;
+ trace_sched_tune_boostgroup_update(cpu, 1, bg->boost_max);
+ continue;
+ }
+
+ /* Check if this update has decreased current max */
+ if (cur_boost_max == old_boost && old_boost > boost) {
+ schedtune_cpu_update(cpu);
+ trace_sched_tune_boostgroup_update(cpu, -1, bg->boost_max);
+ continue;
+ }
+
+ trace_sched_tune_boostgroup_update(cpu, 0, bg->boost_max);
+ }
+
+ return 0;
+}
+
+#define ENQUEUE_TASK 1
+#define DEQUEUE_TASK -1
+
+static inline void
+schedtune_tasks_update(struct task_struct *p, int cpu, int idx, int task_count)
+{
+ struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
+ int tasks = bg->group[idx].tasks + task_count;
+
+ /* Update boosted tasks count while avoiding to make it negative */
+ bg->group[idx].tasks = max(0, tasks);
+
+ trace_sched_tune_tasks_update(p, cpu, tasks, idx,
+ bg->group[idx].boost, bg->boost_max);
+
+ /* Boost group activation or deactivation on that RQ */
+ if (tasks == 1 || tasks == 0)
+ schedtune_cpu_update(cpu);
+}
+
+/*
+ * NOTE: This function must be called while holding the lock on the CPU RQ
+ */
+void schedtune_enqueue_task(struct task_struct *p, int cpu)
+{
+ struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
+ unsigned long irq_flags;
+ struct schedtune *st;
+ int idx;
+
+ if (!unlikely(schedtune_initialized))
+ return;
+
+ /*
+ * When a task is marked PF_EXITING by do_exit() it's going to be
+ * dequeued and enqueued multiple times in the exit path.
+ * Thus we avoid any further update, since we do not want to change
+ * CPU boosting while the task is exiting.
+ */
+ if (p->flags & PF_EXITING)
+ return;
+
+ /*
+ * Boost group accouting is protected by a per-cpu lock and requires
+ * interrupt to be disabled to avoid race conditions for example on
+ * do_exit()::cgroup_exit() and task migration.
+ */
+ raw_spin_lock_irqsave(&bg->lock, irq_flags);
+ rcu_read_lock();
+
+ st = task_schedtune(p);
+ idx = st->idx;
+
+ schedtune_tasks_update(p, cpu, idx, ENQUEUE_TASK);
+
+ rcu_read_unlock();
+ raw_spin_unlock_irqrestore(&bg->lock, irq_flags);
+}
+
+int schedtune_allow_attach(struct cgroup_taskset *tset)
+{
+ /* We always allows tasks to be moved between existing CGroups */
+ return 0;
+}
+
+int schedtune_can_attach(struct cgroup_taskset *tset)
+{
+ struct task_struct *task;
+ struct cgroup_subsys_state *css;
+ struct boost_groups *bg;
+ unsigned long irq_flags;
+ unsigned int cpu;
+ struct rq *rq;
+ int src_bg; /* Source boost group index */
+ int dst_bg; /* Destination boost group index */
+ int tasks;
+
+ if (!unlikely(schedtune_initialized))
+ return 0;
+
+
+ cgroup_taskset_for_each(task, css, tset) {
+
+ /*
+ * Lock the CPU's RQ the task is enqueued to avoid race
+ * conditions with migration code while the task is being
+ * accounted
+ */
+ rq = lock_rq_of(task, &irq_flags);
+
+ if (!task->on_rq) {
+ unlock_rq_of(rq, task, &irq_flags);
+ continue;
+ }
+
+ /*
+ * Boost group accouting is protected by a per-cpu lock and requires
+ * interrupt to be disabled to avoid race conditions on...
+ */
+ cpu = cpu_of(rq);
+ bg = &per_cpu(cpu_boost_groups, cpu);
+ raw_spin_lock(&bg->lock);
+
+ dst_bg = css_st(css)->idx;
+ src_bg = task_schedtune(task)->idx;
+
+ /*
+ * Current task is not changing boostgroup, which can
+ * happen when the new hierarchy is in use.
+ */
+ if (unlikely(dst_bg == src_bg)) {
+ raw_spin_unlock(&bg->lock);
+ unlock_rq_of(rq, task, &irq_flags);
+ continue;
+ }
+
+ /*
+ * This is the case of a RUNNABLE task which is switching its
+ * current boost group.
+ */
+
+ /* Move task from src to dst boost group */
+ tasks = bg->group[src_bg].tasks - 1;
+ bg->group[src_bg].tasks = max(0, tasks);
+ bg->group[dst_bg].tasks += 1;
+
+ raw_spin_unlock(&bg->lock);
+ unlock_rq_of(rq, task, &irq_flags);
+
+ /* Update CPU boost group */
+ if (bg->group[src_bg].tasks == 0 || bg->group[dst_bg].tasks == 1)
+ schedtune_cpu_update(task_cpu(task));
+
+ }
+
+ return 0;
+}
+
+void schedtune_cancel_attach(struct cgroup_taskset *tset)
+{
+ /* This can happen only if SchedTune controller is mounted with
+ * other hierarchies ane one of them fails. Since usually SchedTune is
+ * mouted on its own hierarcy, for the time being we do not implement
+ * a proper rollback mechanism */
+ WARN(1, "SchedTune cancel attach not implemented");
+}
+
+/*
+ * NOTE: This function must be called while holding the lock on the CPU RQ
+ */
+void schedtune_dequeue_task(struct task_struct *p, int cpu)
+{
+ struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
+ unsigned long irq_flags;
+ struct schedtune *st;
+ int idx;
+
+ if (!unlikely(schedtune_initialized))
+ return;
+
+ /*
+ * When a task is marked PF_EXITING by do_exit() it's going to be
+ * dequeued and enqueued multiple times in the exit path.
+ * Thus we avoid any further update, since we do not want to change
+ * CPU boosting while the task is exiting.
+ * The last dequeue is already enforce by the do_exit() code path
+ * via schedtune_exit_task().
+ */
+ if (p->flags & PF_EXITING)
+ return;
+
+ /*
+ * Boost group accouting is protected by a per-cpu lock and requires
+ * interrupt to be disabled to avoid race conditions on...
+ */
+ raw_spin_lock_irqsave(&bg->lock, irq_flags);
+ rcu_read_lock();
+
+ st = task_schedtune(p);
+ idx = st->idx;
+
+ schedtune_tasks_update(p, cpu, idx, DEQUEUE_TASK);
+
+ rcu_read_unlock();
+ raw_spin_unlock_irqrestore(&bg->lock, irq_flags);
+}
+
+void schedtune_exit_task(struct task_struct *tsk)
+{
+ struct schedtune *st;
+ unsigned long irq_flags;
+ unsigned int cpu;
+ struct rq *rq;
+ int idx;
+
+ if (!unlikely(schedtune_initialized))
+ return;
+
+ rq = lock_rq_of(tsk, &irq_flags);
+ rcu_read_lock();
+
+ cpu = cpu_of(rq);
+ st = task_schedtune(tsk);
+ idx = st->idx;
+ schedtune_tasks_update(tsk, cpu, idx, DEQUEUE_TASK);
+
+ rcu_read_unlock();
+ unlock_rq_of(rq, tsk, &irq_flags);
+}
+
+int schedtune_cpu_boost(int cpu)
+{
+ struct boost_groups *bg;
+
+ bg = &per_cpu(cpu_boost_groups, cpu);
+ return bg->boost_max;
+}
+
+int schedtune_task_boost(struct task_struct *p)
+{
+ struct schedtune *st;
+ int task_boost;
+
+ /* Get task boost value */
+ rcu_read_lock();
+ st = task_schedtune(p);
+ task_boost = st->boost;
+ rcu_read_unlock();
+
+ return task_boost;
+}
+
+int schedtune_prefer_idle(struct task_struct *p)
+{
+ struct schedtune *st;
+ int prefer_idle;
+
+ /* Get prefer_idle value */
+ rcu_read_lock();
+ st = task_schedtune(p);
+ prefer_idle = st->prefer_idle;
+ rcu_read_unlock();
+
+ return prefer_idle;
+}
+
static u64
+prefer_idle_read(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+ struct schedtune *st = css_st(css);
+
+ return st->prefer_idle;
+}
+
+static int
+prefer_idle_write(struct cgroup_subsys_state *css, struct cftype *cft,
+ u64 prefer_idle)
+{
+ struct schedtune *st = css_st(css);
+ st->prefer_idle = prefer_idle;
+
+ return 0;
+}
+
+static s64
boost_read(struct cgroup_subsys_state *css, struct cftype *cft)
{
struct schedtune *st = css_st(css);
@@ -256,16 +732,37 @@ boost_read(struct cgroup_subsys_state *css, struct cftype *cft)
static int
boost_write(struct cgroup_subsys_state *css, struct cftype *cft,
- u64 boost)
+ s64 boost)
{
struct schedtune *st = css_st(css);
+ unsigned threshold_idx;
+ int boost_pct;
- if (boost < 0 || boost > 100)
+ if (boost < -100 || boost > 100)
return -EINVAL;
+ boost_pct = boost;
+
+ /*
+ * Update threshold params for Performance Boost (B)
+ * and Performance Constraint (C) regions.
+ * The current implementatio uses the same cuts for both
+ * B and C regions.
+ */
+ threshold_idx = clamp(boost_pct, 0, 99) / 10;
+ st->perf_boost_idx = threshold_idx;
+ st->perf_constrain_idx = threshold_idx;
st->boost = boost;
- if (css == &root_schedtune.css)
+ if (css == &root_schedtune.css) {
sysctl_sched_cfs_boost = boost;
+ perf_boost_idx = threshold_idx;
+ perf_constrain_idx = threshold_idx;
+ }
+
+ /* Update CPU boost */
+ schedtune_boostgroup_update(st->idx, st->boost);
+
+ trace_sched_tune_config(st->boost);
return 0;
}
@@ -289,8 +786,13 @@ static void schedtune_attach(struct cgroup_taskset *tset)
static struct cftype files[] = {
{
.name = "boost",
- .read_u64 = boost_read,
- .write_u64 = boost_write,
+ .read_s64 = boost_read,
+ .write_s64 = boost_write,
+ },
+ {
+ .name = "prefer_idle",
+ .read_u64 = prefer_idle_read,
+ .write_u64 = prefer_idle_write,
},
#ifdef CONFIG_SCHED_HMP
{
@@ -315,26 +817,19 @@ static struct cftype files[] = {
static int
schedtune_boostgroup_init(struct schedtune *st)
{
- /* Keep track of allocated boost groups */
- allocated_group[st->idx] = st;
-
- return 0;
-}
-
-static int
-schedtune_init(void)
-{
struct boost_groups *bg;
int cpu;
+ /* Keep track of allocated boost groups */
+ allocated_group[st->idx] = st;
+
/* Initialize the per CPU boost groups */
for_each_possible_cpu(cpu) {
bg = &per_cpu(cpu_boost_groups, cpu);
- memset(bg, 0, sizeof(struct boost_groups));
+ bg->group[st->idx].boost = 0;
+ bg->group[st->idx].tasks = 0;
}
- pr_info(" schedtune configured to support %d boost groups\n",
- BOOSTGROUPS_COUNT);
return 0;
}
@@ -344,10 +839,8 @@ schedtune_css_alloc(struct cgroup_subsys_state *parent_css)
struct schedtune *st;
int idx;
- if (!parent_css) {
- schedtune_init();
+ if (!parent_css)
return &root_schedtune.css;
- }
/* Allow only single level hierachies */
if (parent_css != &root_schedtune.css) {
@@ -386,6 +879,9 @@ out:
static void
schedtune_boostgroup_release(struct schedtune *st)
{
+ /* Reset this boost group */
+ schedtune_boostgroup_update(st->idx, 0);
+
/* Keep track of allocated boost groups */
allocated_group[st->idx] = NULL;
}
@@ -402,12 +898,55 @@ schedtune_css_free(struct cgroup_subsys_state *css)
struct cgroup_subsys schedtune_cgrp_subsys = {
.css_alloc = schedtune_css_alloc,
.css_free = schedtune_css_free,
+ .allow_attach = schedtune_allow_attach,
+ .can_attach = schedtune_can_attach,
+ .cancel_attach = schedtune_cancel_attach,
.legacy_cftypes = files,
.early_init = 1,
.allow_attach = subsys_cgroup_allow_attach,
.attach = schedtune_attach,
};
+static inline void
+schedtune_init_cgroups(void)
+{
+ struct boost_groups *bg;
+ int cpu;
+
+ /* Initialize the per CPU boost groups */
+ for_each_possible_cpu(cpu) {
+ bg = &per_cpu(cpu_boost_groups, cpu);
+ memset(bg, 0, sizeof(struct boost_groups));
+ }
+
+ pr_info("schedtune: configured to support %d boost groups\n",
+ BOOSTGROUPS_COUNT);
+
+ schedtune_initialized = true;
+}
+
+#else /* CONFIG_CGROUP_SCHEDTUNE */
+
+int
+schedtune_accept_deltas(int nrg_delta, int cap_delta,
+ struct task_struct *task)
+{
+ /* Optimal (O) region */
+ if (nrg_delta < 0 && cap_delta > 0) {
+ trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, 1, 0);
+ return INT_MAX;
+ }
+
+ /* Suboptimal (S) region */
+ if (nrg_delta > 0 && cap_delta < 0) {
+ trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, -1, 5);
+ return -INT_MAX;
+ }
+
+ return __schedtune_accept_deltas(nrg_delta, cap_delta,
+ perf_boost_idx, perf_constrain_idx);
+}
+
#endif /* CONFIG_CGROUP_SCHEDTUNE */
int
@@ -416,10 +955,183 @@ sysctl_sched_cfs_boost_handler(struct ctl_table *table, int write,
loff_t *ppos)
{
int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ unsigned threshold_idx;
+ int boost_pct;
if (ret || !write)
return ret;
+ if (sysctl_sched_cfs_boost < -100 || sysctl_sched_cfs_boost > 100)
+ return -EINVAL;
+ boost_pct = sysctl_sched_cfs_boost;
+
+ /*
+ * Update threshold params for Performance Boost (B)
+ * and Performance Constraint (C) regions.
+ * The current implementatio uses the same cuts for both
+ * B and C regions.
+ */
+ threshold_idx = clamp(boost_pct, 0, 99) / 10;
+ perf_boost_idx = threshold_idx;
+ perf_constrain_idx = threshold_idx;
+
return 0;
}
+#ifdef CONFIG_SCHED_DEBUG
+static void
+schedtune_test_nrg(unsigned long delta_pwr)
+{
+ unsigned long test_delta_pwr;
+ unsigned long test_norm_pwr;
+ int idx;
+
+ /*
+ * Check normalization constants using some constant system
+ * energy values
+ */
+ pr_info("schedtune: verify normalization constants...\n");
+ for (idx = 0; idx < 6; ++idx) {
+ test_delta_pwr = delta_pwr >> idx;
+
+ /* Normalize on max energy for target platform */
+ test_norm_pwr = reciprocal_divide(
+ test_delta_pwr << SCHED_LOAD_SHIFT,
+ schedtune_target_nrg.rdiv);
+
+ pr_info("schedtune: max_pwr/2^%d: %4lu => norm_pwr: %5lu\n",
+ idx, test_delta_pwr, test_norm_pwr);
+ }
+}
+#else
+#define schedtune_test_nrg(delta_pwr)
+#endif
+
+/*
+ * Compute the min/max power consumption of a cluster and all its CPUs
+ */
+static void
+schedtune_add_cluster_nrg(
+ struct sched_domain *sd,
+ struct sched_group *sg,
+ struct target_nrg *ste)
+{
+ struct sched_domain *sd2;
+ struct sched_group *sg2;
+
+ struct cpumask *cluster_cpus;
+ char str[32];
+
+ unsigned long min_pwr;
+ unsigned long max_pwr;
+ int cpu;
+
+ /* Get Cluster energy using EM data for the first CPU */
+ cluster_cpus = sched_group_cpus(sg);
+ snprintf(str, 32, "CLUSTER[%*pbl]",
+ cpumask_pr_args(cluster_cpus));
+
+ min_pwr = sg->sge->idle_states[sg->sge->nr_idle_states - 1].power;
+ max_pwr = sg->sge->cap_states[sg->sge->nr_cap_states - 1].power;
+ pr_info("schedtune: %-17s min_pwr: %5lu max_pwr: %5lu\n",
+ str, min_pwr, max_pwr);
+
+ /*
+ * Keep track of this cluster's energy in the computation of the
+ * overall system energy
+ */
+ ste->min_power += min_pwr;
+ ste->max_power += max_pwr;
+
+ /* Get CPU energy using EM data for each CPU in the group */
+ for_each_cpu(cpu, cluster_cpus) {
+ /* Get a SD view for the specific CPU */
+ for_each_domain(cpu, sd2) {
+ /* Get the CPU group */
+ sg2 = sd2->groups;
+ min_pwr = sg2->sge->idle_states[sg2->sge->nr_idle_states - 1].power;
+ max_pwr = sg2->sge->cap_states[sg2->sge->nr_cap_states - 1].power;
+
+ ste->min_power += min_pwr;
+ ste->max_power += max_pwr;
+
+ snprintf(str, 32, "CPU[%d]", cpu);
+ pr_info("schedtune: %-17s min_pwr: %5lu max_pwr: %5lu\n",
+ str, min_pwr, max_pwr);
+
+ /*
+ * Assume we have EM data only at the CPU and
+ * the upper CLUSTER level
+ */
+ BUG_ON(!cpumask_equal(
+ sched_group_cpus(sg),
+ sched_group_cpus(sd2->parent->groups)
+ ));
+ break;
+ }
+ }
+}
+
+/*
+ * Initialize the constants required to compute normalized energy.
+ * The values of these constants depends on the EM data for the specific
+ * target system and topology.
+ * Thus, this function is expected to be called by the code
+ * that bind the EM to the topology information.
+ */
+static int
+schedtune_init(void)
+{
+ struct target_nrg *ste = &schedtune_target_nrg;
+ unsigned long delta_pwr = 0;
+ struct sched_domain *sd;
+ struct sched_group *sg;
+
+ pr_info("schedtune: init normalization constants...\n");
+ ste->max_power = 0;
+ ste->min_power = 0;
+
+ rcu_read_lock();
+
+ /*
+ * When EAS is in use, we always have a pointer to the highest SD
+ * which provides EM data.
+ */
+ sd = rcu_dereference(per_cpu(sd_ea, cpumask_first(cpu_online_mask)));
+ if (!sd) {
+ if (energy_aware())
+ pr_warn("schedtune: no energy model data\n");
+ goto nodata;
+ }
+
+ sg = sd->groups;
+ do {
+ schedtune_add_cluster_nrg(sd, sg, ste);
+ } while (sg = sg->next, sg != sd->groups);
+
+ rcu_read_unlock();
+
+ pr_info("schedtune: %-17s min_pwr: %5lu max_pwr: %5lu\n",
+ "SYSTEM", ste->min_power, ste->max_power);
+
+ /* Compute normalization constants */
+ delta_pwr = ste->max_power - ste->min_power;
+ ste->rdiv = reciprocal_value(delta_pwr);
+ pr_info("schedtune: using normalization constants mul: %u sh1: %u sh2: %u\n",
+ ste->rdiv.m, ste->rdiv.sh1, ste->rdiv.sh2);
+
+ schedtune_test_nrg(delta_pwr);
+
+#ifdef CONFIG_CGROUP_SCHEDTUNE
+ schedtune_init_cgroups();
+#else
+ pr_info("schedtune: configured to support global boosting only\n");
+#endif
+
+ return 0;
+
+nodata:
+ rcu_read_unlock();
+ return -EINVAL;
+}
+postcore_initcall(schedtune_init);
diff --git a/kernel/sched/tune.h b/kernel/sched/tune.h
new file mode 100644
index 000000000000..4f6441771e4c
--- /dev/null
+++ b/kernel/sched/tune.h
@@ -0,0 +1,55 @@
+
+#ifdef CONFIG_SCHED_TUNE
+
+#include <linux/reciprocal_div.h>
+
+/*
+ * System energy normalization constants
+ */
+struct target_nrg {
+ unsigned long min_power;
+ unsigned long max_power;
+ struct reciprocal_value rdiv;
+};
+
+#ifdef CONFIG_CGROUP_SCHEDTUNE
+
+int schedtune_cpu_boost(int cpu);
+int schedtune_task_boost(struct task_struct *tsk);
+
+int schedtune_prefer_idle(struct task_struct *tsk);
+
+void schedtune_exit_task(struct task_struct *tsk);
+
+void schedtune_enqueue_task(struct task_struct *p, int cpu);
+void schedtune_dequeue_task(struct task_struct *p, int cpu);
+
+#else /* CONFIG_CGROUP_SCHEDTUNE */
+
+#define schedtune_cpu_boost(cpu) get_sysctl_sched_cfs_boost()
+#define schedtune_task_boost(tsk) get_sysctl_sched_cfs_boost()
+
+#define schedtune_exit_task(task) do { } while (0)
+
+#define schedtune_enqueue_task(task, cpu) do { } while (0)
+#define schedtune_dequeue_task(task, cpu) do { } while (0)
+
+#endif /* CONFIG_CGROUP_SCHEDTUNE */
+
+int schedtune_normalize_energy(int energy);
+int schedtune_accept_deltas(int nrg_delta, int cap_delta,
+ struct task_struct *task);
+
+#else /* CONFIG_SCHED_TUNE */
+
+#define schedtune_cpu_boost(cpu) 0
+#define schedtune_task_boost(tsk) 0
+
+#define schedtune_exit_task(task) do { } while (0)
+
+#define schedtune_enqueue_task(task, cpu) do { } while (0)
+#define schedtune_dequeue_task(task, cpu) do { } while (0)
+
+#define schedtune_accept_deltas(nrg_delta, cap_delta, task) nrg_delta
+
+#endif /* CONFIG_SCHED_TUNE */
diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c
new file mode 100644
index 000000000000..07b7f84b37e2
--- /dev/null
+++ b/kernel/sched/walt.c
@@ -0,0 +1,1171 @@
+/*
+ * Copyright (c) 2016, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ *
+ * Window Assisted Load Tracking (WALT) implementation credits:
+ * Srivatsa Vaddagiri, Steve Muckle, Syed Rameez Mustafa, Joonwoo Park,
+ * Pavan Kumar Kondeti, Olav Haugan
+ *
+ * 2016-03-06: Integration with EAS/refactoring by Vikram Mulukutla
+ * and Todd Kjos
+ */
+
+#include <linux/syscore_ops.h>
+#include <linux/cpufreq.h>
+#include <trace/events/sched.h>
+#include <clocksource/arm_arch_timer.h>
+#include "sched.h"
+#include "walt.h"
+
+#define WINDOW_STATS_RECENT 0
+#define WINDOW_STATS_MAX 1
+#define WINDOW_STATS_MAX_RECENT_AVG 2
+#define WINDOW_STATS_AVG 3
+#define WINDOW_STATS_INVALID_POLICY 4
+
+#define EXITING_TASK_MARKER 0xdeaddead
+
+static __read_mostly unsigned int walt_ravg_hist_size = 5;
+static __read_mostly unsigned int walt_window_stats_policy =
+ WINDOW_STATS_MAX_RECENT_AVG;
+static __read_mostly unsigned int walt_account_wait_time = 1;
+static __read_mostly unsigned int walt_freq_account_wait_time = 0;
+static __read_mostly unsigned int walt_io_is_busy = 0;
+
+unsigned int sysctl_sched_walt_init_task_load_pct = 15;
+
+/* 1 -> use PELT based load stats, 0 -> use window-based load stats */
+unsigned int __read_mostly walt_disabled = 0;
+
+static unsigned int max_possible_efficiency = 1024;
+static unsigned int min_possible_efficiency = 1024;
+
+/*
+ * Maximum possible frequency across all cpus. Task demand and cpu
+ * capacity (cpu_power) metrics are scaled in reference to it.
+ */
+static unsigned int max_possible_freq = 1;
+
+/*
+ * Minimum possible max_freq across all cpus. This will be same as
+ * max_possible_freq on homogeneous systems and could be different from
+ * max_possible_freq on heterogenous systems. min_max_freq is used to derive
+ * capacity (cpu_power) of cpus.
+ */
+static unsigned int min_max_freq = 1;
+
+static unsigned int max_capacity = 1024;
+static unsigned int min_capacity = 1024;
+static unsigned int max_load_scale_factor = 1024;
+static unsigned int max_possible_capacity = 1024;
+
+/* Mask of all CPUs that have max_possible_capacity */
+static cpumask_t mpc_mask = CPU_MASK_ALL;
+
+/* Window size (in ns) */
+__read_mostly unsigned int walt_ravg_window = 20000000;
+
+/* Min window size (in ns) = 10ms */
+#define MIN_SCHED_RAVG_WINDOW 10000000
+
+/* Max window size (in ns) = 1s */
+#define MAX_SCHED_RAVG_WINDOW 1000000000
+
+static unsigned int sync_cpu;
+static ktime_t ktime_last;
+static bool walt_ktime_suspended;
+
+static unsigned int task_load(struct task_struct *p)
+{
+ return p->ravg.demand;
+}
+
+void
+walt_inc_cumulative_runnable_avg(struct rq *rq,
+ struct task_struct *p)
+{
+ rq->cumulative_runnable_avg += p->ravg.demand;
+}
+
+void
+walt_dec_cumulative_runnable_avg(struct rq *rq,
+ struct task_struct *p)
+{
+ rq->cumulative_runnable_avg -= p->ravg.demand;
+ BUG_ON((s64)rq->cumulative_runnable_avg < 0);
+}
+
+static void
+fixup_cumulative_runnable_avg(struct rq *rq,
+ struct task_struct *p, s64 task_load_delta)
+{
+ rq->cumulative_runnable_avg += task_load_delta;
+ if ((s64)rq->cumulative_runnable_avg < 0)
+ panic("cra less than zero: tld: %lld, task_load(p) = %u\n",
+ task_load_delta, task_load(p));
+}
+
+u64 walt_ktime_clock(void)
+{
+ if (unlikely(walt_ktime_suspended))
+ return ktime_to_ns(ktime_last);
+ return ktime_get_ns();
+}
+
+static void walt_resume(void)
+{
+ walt_ktime_suspended = false;
+}
+
+static int walt_suspend(void)
+{
+ ktime_last = ktime_get();
+ walt_ktime_suspended = true;
+ return 0;
+}
+
+static struct syscore_ops walt_syscore_ops = {
+ .resume = walt_resume,
+ .suspend = walt_suspend
+};
+
+static int __init walt_init_ops(void)
+{
+ register_syscore_ops(&walt_syscore_ops);
+ return 0;
+}
+late_initcall(walt_init_ops);
+
+void walt_inc_cfs_cumulative_runnable_avg(struct cfs_rq *cfs_rq,
+ struct task_struct *p)
+{
+ cfs_rq->cumulative_runnable_avg += p->ravg.demand;
+}
+
+void walt_dec_cfs_cumulative_runnable_avg(struct cfs_rq *cfs_rq,
+ struct task_struct *p)
+{
+ cfs_rq->cumulative_runnable_avg -= p->ravg.demand;
+}
+
+static int exiting_task(struct task_struct *p)
+{
+ if (p->flags & PF_EXITING) {
+ if (p->ravg.sum_history[0] != EXITING_TASK_MARKER) {
+ p->ravg.sum_history[0] = EXITING_TASK_MARKER;
+ }
+ return 1;
+ }
+ return 0;
+}
+
+static int __init set_walt_ravg_window(char *str)
+{
+ get_option(&str, &walt_ravg_window);
+
+ walt_disabled = (walt_ravg_window < MIN_SCHED_RAVG_WINDOW ||
+ walt_ravg_window > MAX_SCHED_RAVG_WINDOW);
+ return 0;
+}
+
+early_param("walt_ravg_window", set_walt_ravg_window);
+
+static void
+update_window_start(struct rq *rq, u64 wallclock)
+{
+ s64 delta;
+ int nr_windows;
+
+ delta = wallclock - rq->window_start;
+ /* If the MPM global timer is cleared, set delta as 0 to avoid kernel BUG happening */
+ if (delta < 0) {
+ if (arch_timer_read_counter() == 0)
+ delta = 0;
+ else
+ BUG_ON(1);
+ }
+
+ if (delta < walt_ravg_window)
+ return;
+
+ nr_windows = div64_u64(delta, walt_ravg_window);
+ rq->window_start += (u64)nr_windows * (u64)walt_ravg_window;
+}
+
+static u64 scale_exec_time(u64 delta, struct rq *rq)
+{
+ unsigned int cur_freq = rq->cur_freq;
+ int sf;
+
+ if (unlikely(cur_freq > max_possible_freq))
+ cur_freq = rq->max_possible_freq;
+
+ /* round up div64 */
+ delta = div64_u64(delta * cur_freq + max_possible_freq - 1,
+ max_possible_freq);
+
+ sf = DIV_ROUND_UP(rq->efficiency * 1024, max_possible_efficiency);
+
+ delta *= sf;
+ delta >>= 10;
+
+ return delta;
+}
+
+static int cpu_is_waiting_on_io(struct rq *rq)
+{
+ if (!walt_io_is_busy)
+ return 0;
+
+ return atomic_read(&rq->nr_iowait);
+}
+
+void walt_account_irqtime(int cpu, struct task_struct *curr,
+ u64 delta, u64 wallclock)
+{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long flags, nr_windows;
+ u64 cur_jiffies_ts;
+
+ raw_spin_lock_irqsave(&rq->lock, flags);
+
+ /*
+ * cputime (wallclock) uses sched_clock so use the same here for
+ * consistency.
+ */
+ delta += sched_clock() - wallclock;
+ cur_jiffies_ts = get_jiffies_64();
+
+ if (is_idle_task(curr))
+ walt_update_task_ravg(curr, rq, IRQ_UPDATE, walt_ktime_clock(),
+ delta);
+
+ nr_windows = cur_jiffies_ts - rq->irqload_ts;
+
+ if (nr_windows) {
+ if (nr_windows < 10) {
+ /* Decay CPU's irqload by 3/4 for each window. */
+ rq->avg_irqload *= (3 * nr_windows);
+ rq->avg_irqload = div64_u64(rq->avg_irqload,
+ 4 * nr_windows);
+ } else {
+ rq->avg_irqload = 0;
+ }
+ rq->avg_irqload += rq->cur_irqload;
+ rq->cur_irqload = 0;
+ }
+
+ rq->cur_irqload += delta;
+ rq->irqload_ts = cur_jiffies_ts;
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+}
+
+
+#define WALT_HIGH_IRQ_TIMEOUT 3
+
+u64 walt_irqload(int cpu) {
+ struct rq *rq = cpu_rq(cpu);
+ s64 delta;
+ delta = get_jiffies_64() - rq->irqload_ts;
+
+ /*
+ * Current context can be preempted by irq and rq->irqload_ts can be
+ * updated by irq context so that delta can be negative.
+ * But this is okay and we can safely return as this means there
+ * was recent irq occurrence.
+ */
+
+ if (delta < WALT_HIGH_IRQ_TIMEOUT)
+ return rq->avg_irqload;
+ else
+ return 0;
+}
+
+int walt_cpu_high_irqload(int cpu) {
+ return walt_irqload(cpu) >= sysctl_sched_walt_cpu_high_irqload;
+}
+
+static int account_busy_for_cpu_time(struct rq *rq, struct task_struct *p,
+ u64 irqtime, int event)
+{
+ if (is_idle_task(p)) {
+ /* TASK_WAKE && TASK_MIGRATE is not possible on idle task! */
+ if (event == PICK_NEXT_TASK)
+ return 0;
+
+ /* PUT_PREV_TASK, TASK_UPDATE && IRQ_UPDATE are left */
+ return irqtime || cpu_is_waiting_on_io(rq);
+ }
+
+ if (event == TASK_WAKE)
+ return 0;
+
+ if (event == PUT_PREV_TASK || event == IRQ_UPDATE ||
+ event == TASK_UPDATE)
+ return 1;
+
+ /* Only TASK_MIGRATE && PICK_NEXT_TASK left */
+ return walt_freq_account_wait_time;
+}
+
+/*
+ * Account cpu activity in its busy time counters (rq->curr/prev_runnable_sum)
+ */
+static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
+ int event, u64 wallclock, u64 irqtime)
+{
+ int new_window, nr_full_windows = 0;
+ int p_is_curr_task = (p == rq->curr);
+ u64 mark_start = p->ravg.mark_start;
+ u64 window_start = rq->window_start;
+ u32 window_size = walt_ravg_window;
+ u64 delta;
+
+ new_window = mark_start < window_start;
+ if (new_window) {
+ nr_full_windows = div64_u64((window_start - mark_start),
+ window_size);
+ if (p->ravg.active_windows < USHRT_MAX)
+ p->ravg.active_windows++;
+ }
+
+ /* Handle per-task window rollover. We don't care about the idle
+ * task or exiting tasks. */
+ if (new_window && !is_idle_task(p) && !exiting_task(p)) {
+ u32 curr_window = 0;
+
+ if (!nr_full_windows)
+ curr_window = p->ravg.curr_window;
+
+ p->ravg.prev_window = curr_window;
+ p->ravg.curr_window = 0;
+ }
+
+ if (!account_busy_for_cpu_time(rq, p, irqtime, event)) {
+ /* account_busy_for_cpu_time() = 0, so no update to the
+ * task's current window needs to be made. This could be
+ * for example
+ *
+ * - a wakeup event on a task within the current
+ * window (!new_window below, no action required),
+ * - switching to a new task from idle (PICK_NEXT_TASK)
+ * in a new window where irqtime is 0 and we aren't
+ * waiting on IO */
+
+ if (!new_window)
+ return;
+
+ /* A new window has started. The RQ demand must be rolled
+ * over if p is the current task. */
+ if (p_is_curr_task) {
+ u64 prev_sum = 0;
+
+ /* p is either idle task or an exiting task */
+ if (!nr_full_windows) {
+ prev_sum = rq->curr_runnable_sum;
+ }
+
+ rq->prev_runnable_sum = prev_sum;
+ rq->curr_runnable_sum = 0;
+ }
+
+ return;
+ }
+
+ if (!new_window) {
+ /* account_busy_for_cpu_time() = 1 so busy time needs
+ * to be accounted to the current window. No rollover
+ * since we didn't start a new window. An example of this is
+ * when a task starts execution and then sleeps within the
+ * same window. */
+
+ if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq))
+ delta = wallclock - mark_start;
+ else
+ delta = irqtime;
+ delta = scale_exec_time(delta, rq);
+ rq->curr_runnable_sum += delta;
+ if (!is_idle_task(p) && !exiting_task(p))
+ p->ravg.curr_window += delta;
+
+ return;
+ }
+
+ if (!p_is_curr_task) {
+ /* account_busy_for_cpu_time() = 1 so busy time needs
+ * to be accounted to the current window. A new window
+ * has also started, but p is not the current task, so the
+ * window is not rolled over - just split up and account
+ * as necessary into curr and prev. The window is only
+ * rolled over when a new window is processed for the current
+ * task.
+ *
+ * Irqtime can't be accounted by a task that isn't the
+ * currently running task. */
+
+ if (!nr_full_windows) {
+ /* A full window hasn't elapsed, account partial
+ * contribution to previous completed window. */
+ delta = scale_exec_time(window_start - mark_start, rq);
+ if (!exiting_task(p))
+ p->ravg.prev_window += delta;
+ } else {
+ /* Since at least one full window has elapsed,
+ * the contribution to the previous window is the
+ * full window (window_size). */
+ delta = scale_exec_time(window_size, rq);
+ if (!exiting_task(p))
+ p->ravg.prev_window = delta;
+ }
+ rq->prev_runnable_sum += delta;
+
+ /* Account piece of busy time in the current window. */
+ delta = scale_exec_time(wallclock - window_start, rq);
+ rq->curr_runnable_sum += delta;
+ if (!exiting_task(p))
+ p->ravg.curr_window = delta;
+
+ return;
+ }
+
+ if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) {
+ /* account_busy_for_cpu_time() = 1 so busy time needs
+ * to be accounted to the current window. A new window
+ * has started and p is the current task so rollover is
+ * needed. If any of these three above conditions are true
+ * then this busy time can't be accounted as irqtime.
+ *
+ * Busy time for the idle task or exiting tasks need not
+ * be accounted.
+ *
+ * An example of this would be a task that starts execution
+ * and then sleeps once a new window has begun. */
+
+ if (!nr_full_windows) {
+ /* A full window hasn't elapsed, account partial
+ * contribution to previous completed window. */
+ delta = scale_exec_time(window_start - mark_start, rq);
+ if (!is_idle_task(p) && !exiting_task(p))
+ p->ravg.prev_window += delta;
+
+ delta += rq->curr_runnable_sum;
+ } else {
+ /* Since at least one full window has elapsed,
+ * the contribution to the previous window is the
+ * full window (window_size). */
+ delta = scale_exec_time(window_size, rq);
+ if (!is_idle_task(p) && !exiting_task(p))
+ p->ravg.prev_window = delta;
+
+ }
+ /*
+ * Rollover for normal runnable sum is done here by overwriting
+ * the values in prev_runnable_sum and curr_runnable_sum.
+ * Rollover for new task runnable sum has completed by previous
+ * if-else statement.
+ */
+ rq->prev_runnable_sum = delta;
+
+ /* Account piece of busy time in the current window. */
+ delta = scale_exec_time(wallclock - window_start, rq);
+ rq->curr_runnable_sum = delta;
+ if (!is_idle_task(p) && !exiting_task(p))
+ p->ravg.curr_window = delta;
+
+ return;
+ }
+
+ if (irqtime) {
+ /* account_busy_for_cpu_time() = 1 so busy time needs
+ * to be accounted to the current window. A new window
+ * has started and p is the current task so rollover is
+ * needed. The current task must be the idle task because
+ * irqtime is not accounted for any other task.
+ *
+ * Irqtime will be accounted each time we process IRQ activity
+ * after a period of idleness, so we know the IRQ busy time
+ * started at wallclock - irqtime. */
+
+ BUG_ON(!is_idle_task(p));
+ mark_start = wallclock - irqtime;
+
+ /* Roll window over. If IRQ busy time was just in the current
+ * window then that is all that need be accounted. */
+ rq->prev_runnable_sum = rq->curr_runnable_sum;
+ if (mark_start > window_start) {
+ rq->curr_runnable_sum = scale_exec_time(irqtime, rq);
+ return;
+ }
+
+ /* The IRQ busy time spanned multiple windows. Process the
+ * busy time preceding the current window start first. */
+ delta = window_start - mark_start;
+ if (delta > window_size)
+ delta = window_size;
+ delta = scale_exec_time(delta, rq);
+ rq->prev_runnable_sum += delta;
+
+ /* Process the remaining IRQ busy time in the current window. */
+ delta = wallclock - window_start;
+ rq->curr_runnable_sum = scale_exec_time(delta, rq);
+
+ return;
+ }
+
+ BUG();
+}
+
+static int account_busy_for_task_demand(struct task_struct *p, int event)
+{
+ /* No need to bother updating task demand for exiting tasks
+ * or the idle task. */
+ if (exiting_task(p) || is_idle_task(p))
+ return 0;
+
+ /* When a task is waking up it is completing a segment of non-busy
+ * time. Likewise, if wait time is not treated as busy time, then
+ * when a task begins to run or is migrated, it is not running and
+ * is completing a segment of non-busy time. */
+ if (event == TASK_WAKE || (!walt_account_wait_time &&
+ (event == PICK_NEXT_TASK || event == TASK_MIGRATE)))
+ return 0;
+
+ return 1;
+}
+
+/*
+ * Called when new window is starting for a task, to record cpu usage over
+ * recently concluded window(s). Normally 'samples' should be 1. It can be > 1
+ * when, say, a real-time task runs without preemption for several windows at a
+ * stretch.
+ */
+static void update_history(struct rq *rq, struct task_struct *p,
+ u32 runtime, int samples, int event)
+{
+ u32 *hist = &p->ravg.sum_history[0];
+ int ridx, widx;
+ u32 max = 0, avg, demand;
+ u64 sum = 0;
+
+ /* Ignore windows where task had no activity */
+ if (!runtime || is_idle_task(p) || exiting_task(p) || !samples)
+ goto done;
+
+ /* Push new 'runtime' value onto stack */
+ widx = walt_ravg_hist_size - 1;
+ ridx = widx - samples;
+ for (; ridx >= 0; --widx, --ridx) {
+ hist[widx] = hist[ridx];
+ sum += hist[widx];
+ if (hist[widx] > max)
+ max = hist[widx];
+ }
+
+ for (widx = 0; widx < samples && widx < walt_ravg_hist_size; widx++) {
+ hist[widx] = runtime;
+ sum += hist[widx];
+ if (hist[widx] > max)
+ max = hist[widx];
+ }
+
+ p->ravg.sum = 0;
+
+ if (walt_window_stats_policy == WINDOW_STATS_RECENT) {
+ demand = runtime;
+ } else if (walt_window_stats_policy == WINDOW_STATS_MAX) {
+ demand = max;
+ } else {
+ avg = div64_u64(sum, walt_ravg_hist_size);
+ if (walt_window_stats_policy == WINDOW_STATS_AVG)
+ demand = avg;
+ else
+ demand = max(avg, runtime);
+ }
+
+ /*
+ * A throttled deadline sched class task gets dequeued without
+ * changing p->on_rq. Since the dequeue decrements hmp stats
+ * avoid decrementing it here again.
+ */
+ if (task_on_rq_queued(p) && (!task_has_dl_policy(p) ||
+ !p->dl.dl_throttled))
+ fixup_cumulative_runnable_avg(rq, p, demand);
+
+ p->ravg.demand = demand;
+
+done:
+ trace_walt_update_history(rq, p, runtime, samples, event);
+ return;
+}
+
+static void add_to_task_demand(struct rq *rq, struct task_struct *p,
+ u64 delta)
+{
+ delta = scale_exec_time(delta, rq);
+ p->ravg.sum += delta;
+ if (unlikely(p->ravg.sum > walt_ravg_window))
+ p->ravg.sum = walt_ravg_window;
+}
+
+/*
+ * Account cpu demand of task and/or update task's cpu demand history
+ *
+ * ms = p->ravg.mark_start;
+ * wc = wallclock
+ * ws = rq->window_start
+ *
+ * Three possibilities:
+ *
+ * a) Task event is contained within one window.
+ * window_start < mark_start < wallclock
+ *
+ * ws ms wc
+ * | | |
+ * V V V
+ * |---------------|
+ *
+ * In this case, p->ravg.sum is updated *iff* event is appropriate
+ * (ex: event == PUT_PREV_TASK)
+ *
+ * b) Task event spans two windows.
+ * mark_start < window_start < wallclock
+ *
+ * ms ws wc
+ * | | |
+ * V V V
+ * -----|-------------------
+ *
+ * In this case, p->ravg.sum is updated with (ws - ms) *iff* event
+ * is appropriate, then a new window sample is recorded followed
+ * by p->ravg.sum being set to (wc - ws) *iff* event is appropriate.
+ *
+ * c) Task event spans more than two windows.
+ *
+ * ms ws_tmp ws wc
+ * | | | |
+ * V V V V
+ * ---|-------|-------|-------|-------|------
+ * | |
+ * |<------ nr_full_windows ------>|
+ *
+ * In this case, p->ravg.sum is updated with (ws_tmp - ms) first *iff*
+ * event is appropriate, window sample of p->ravg.sum is recorded,
+ * 'nr_full_window' samples of window_size is also recorded *iff*
+ * event is appropriate and finally p->ravg.sum is set to (wc - ws)
+ * *iff* event is appropriate.
+ *
+ * IMPORTANT : Leave p->ravg.mark_start unchanged, as update_cpu_busy_time()
+ * depends on it!
+ */
+static void update_task_demand(struct task_struct *p, struct rq *rq,
+ int event, u64 wallclock)
+{
+ u64 mark_start = p->ravg.mark_start;
+ u64 delta, window_start = rq->window_start;
+ int new_window, nr_full_windows;
+ u32 window_size = walt_ravg_window;
+
+ new_window = mark_start < window_start;
+ if (!account_busy_for_task_demand(p, event)) {
+ if (new_window)
+ /* If the time accounted isn't being accounted as
+ * busy time, and a new window started, only the
+ * previous window need be closed out with the
+ * pre-existing demand. Multiple windows may have
+ * elapsed, but since empty windows are dropped,
+ * it is not necessary to account those. */
+ update_history(rq, p, p->ravg.sum, 1, event);
+ return;
+ }
+
+ if (!new_window) {
+ /* The simple case - busy time contained within the existing
+ * window. */
+ add_to_task_demand(rq, p, wallclock - mark_start);
+ return;
+ }
+
+ /* Busy time spans at least two windows. Temporarily rewind
+ * window_start to first window boundary after mark_start. */
+ delta = window_start - mark_start;
+ nr_full_windows = div64_u64(delta, window_size);
+ window_start -= (u64)nr_full_windows * (u64)window_size;
+
+ /* Process (window_start - mark_start) first */
+ add_to_task_demand(rq, p, window_start - mark_start);
+
+ /* Push new sample(s) into task's demand history */
+ update_history(rq, p, p->ravg.sum, 1, event);
+ if (nr_full_windows)
+ update_history(rq, p, scale_exec_time(window_size, rq),
+ nr_full_windows, event);
+
+ /* Roll window_start back to current to process any remainder
+ * in current window. */
+ window_start += (u64)nr_full_windows * (u64)window_size;
+
+ /* Process (wallclock - window_start) next */
+ mark_start = window_start;
+ add_to_task_demand(rq, p, wallclock - mark_start);
+}
+
+/* Reflect task activity on its demand and cpu's busy time statistics */
+void walt_update_task_ravg(struct task_struct *p, struct rq *rq,
+ int event, u64 wallclock, u64 irqtime)
+{
+ if (walt_disabled || !rq->window_start)
+ return;
+
+ lockdep_assert_held(&rq->lock);
+
+ update_window_start(rq, wallclock);
+
+ if (!p->ravg.mark_start)
+ goto done;
+
+ update_task_demand(p, rq, event, wallclock);
+ update_cpu_busy_time(p, rq, event, wallclock, irqtime);
+
+done:
+ trace_walt_update_task_ravg(p, rq, event, wallclock, irqtime);
+
+ p->ravg.mark_start = wallclock;
+}
+
+unsigned long __weak arch_get_cpu_efficiency(int cpu)
+{
+ return SCHED_LOAD_SCALE;
+}
+
+void walt_init_cpu_efficiency(void)
+{
+ int i, efficiency;
+ unsigned int max = 0, min = UINT_MAX;
+
+ for_each_possible_cpu(i) {
+ efficiency = arch_get_cpu_efficiency(i);
+ cpu_rq(i)->efficiency = efficiency;
+
+ if (efficiency > max)
+ max = efficiency;
+ if (efficiency < min)
+ min = efficiency;
+ }
+
+ if (max)
+ max_possible_efficiency = max;
+
+ if (min)
+ min_possible_efficiency = min;
+}
+
+static void reset_task_stats(struct task_struct *p)
+{
+ u32 sum = 0;
+
+ if (exiting_task(p))
+ sum = EXITING_TASK_MARKER;
+
+ memset(&p->ravg, 0, sizeof(struct ravg));
+ /* Retain EXITING_TASK marker */
+ p->ravg.sum_history[0] = sum;
+}
+
+void walt_mark_task_starting(struct task_struct *p)
+{
+ u64 wallclock;
+ struct rq *rq = task_rq(p);
+
+ if (!rq->window_start) {
+ reset_task_stats(p);
+ return;
+ }
+
+ wallclock = walt_ktime_clock();
+ p->ravg.mark_start = wallclock;
+}
+
+void walt_set_window_start(struct rq *rq)
+{
+ int cpu = cpu_of(rq);
+ struct rq *sync_rq = cpu_rq(sync_cpu);
+
+ if (rq->window_start)
+ return;
+
+ if (cpu == sync_cpu) {
+ rq->window_start = walt_ktime_clock();
+ } else {
+ raw_spin_unlock(&rq->lock);
+ double_rq_lock(rq, sync_rq);
+ rq->window_start = cpu_rq(sync_cpu)->window_start;
+ rq->curr_runnable_sum = rq->prev_runnable_sum = 0;
+ raw_spin_unlock(&sync_rq->lock);
+ }
+
+ rq->curr->ravg.mark_start = rq->window_start;
+}
+
+void walt_migrate_sync_cpu(int cpu)
+{
+ if (cpu == sync_cpu)
+ sync_cpu = smp_processor_id();
+}
+
+void walt_fixup_busy_time(struct task_struct *p, int new_cpu)
+{
+ struct rq *src_rq = task_rq(p);
+ struct rq *dest_rq = cpu_rq(new_cpu);
+ u64 wallclock;
+
+ if (!p->on_rq && p->state != TASK_WAKING)
+ return;
+
+ if (exiting_task(p)) {
+ return;
+ }
+
+ if (p->state == TASK_WAKING)
+ double_rq_lock(src_rq, dest_rq);
+
+ wallclock = walt_ktime_clock();
+
+ walt_update_task_ravg(task_rq(p)->curr, task_rq(p),
+ TASK_UPDATE, wallclock, 0);
+ walt_update_task_ravg(dest_rq->curr, dest_rq,
+ TASK_UPDATE, wallclock, 0);
+
+ walt_update_task_ravg(p, task_rq(p), TASK_MIGRATE, wallclock, 0);
+
+ if (p->ravg.curr_window) {
+ src_rq->curr_runnable_sum -= p->ravg.curr_window;
+ dest_rq->curr_runnable_sum += p->ravg.curr_window;
+ }
+
+ if (p->ravg.prev_window) {
+ src_rq->prev_runnable_sum -= p->ravg.prev_window;
+ dest_rq->prev_runnable_sum += p->ravg.prev_window;
+ }
+
+ if ((s64)src_rq->prev_runnable_sum < 0) {
+ src_rq->prev_runnable_sum = 0;
+ WARN_ON(1);
+ }
+ if ((s64)src_rq->curr_runnable_sum < 0) {
+ src_rq->curr_runnable_sum = 0;
+ WARN_ON(1);
+ }
+
+ trace_walt_migration_update_sum(src_rq, p);
+ trace_walt_migration_update_sum(dest_rq, p);
+
+ if (p->state == TASK_WAKING)
+ double_rq_unlock(src_rq, dest_rq);
+}
+
+/* Keep track of max/min capacity possible across CPUs "currently" */
+static void __update_min_max_capacity(void)
+{
+ int i;
+ int max = 0, min = INT_MAX;
+
+ for_each_online_cpu(i) {
+ if (cpu_rq(i)->capacity > max)
+ max = cpu_rq(i)->capacity;
+ if (cpu_rq(i)->capacity < min)
+ min = cpu_rq(i)->capacity;
+ }
+
+ max_capacity = max;
+ min_capacity = min;
+}
+
+static void update_min_max_capacity(void)
+{
+ unsigned long flags;
+ int i;
+
+ local_irq_save(flags);
+ for_each_possible_cpu(i)
+ raw_spin_lock(&cpu_rq(i)->lock);
+
+ __update_min_max_capacity();
+
+ for_each_possible_cpu(i)
+ raw_spin_unlock(&cpu_rq(i)->lock);
+ local_irq_restore(flags);
+}
+
+/*
+ * Return 'capacity' of a cpu in reference to "least" efficient cpu, such that
+ * least efficient cpu gets capacity of 1024
+ */
+static unsigned long capacity_scale_cpu_efficiency(int cpu)
+{
+ return (1024 * cpu_rq(cpu)->efficiency) / min_possible_efficiency;
+}
+
+/*
+ * Return 'capacity' of a cpu in reference to cpu with lowest max_freq
+ * (min_max_freq), such that one with lowest max_freq gets capacity of 1024.
+ */
+static unsigned long capacity_scale_cpu_freq(int cpu)
+{
+ return (1024 * cpu_rq(cpu)->max_freq) / min_max_freq;
+}
+
+/*
+ * Return load_scale_factor of a cpu in reference to "most" efficient cpu, so
+ * that "most" efficient cpu gets a load_scale_factor of 1
+ */
+static unsigned long load_scale_cpu_efficiency(int cpu)
+{
+ return DIV_ROUND_UP(1024 * max_possible_efficiency,
+ cpu_rq(cpu)->efficiency);
+}
+
+/*
+ * Return load_scale_factor of a cpu in reference to cpu with best max_freq
+ * (max_possible_freq), so that one with best max_freq gets a load_scale_factor
+ * of 1.
+ */
+static unsigned long load_scale_cpu_freq(int cpu)
+{
+ return DIV_ROUND_UP(1024 * max_possible_freq, cpu_rq(cpu)->max_freq);
+}
+
+static int compute_capacity(int cpu)
+{
+ int capacity = 1024;
+
+ capacity *= capacity_scale_cpu_efficiency(cpu);
+ capacity >>= 10;
+
+ capacity *= capacity_scale_cpu_freq(cpu);
+ capacity >>= 10;
+
+ return capacity;
+}
+
+static int compute_load_scale_factor(int cpu)
+{
+ int load_scale = 1024;
+
+ /*
+ * load_scale_factor accounts for the fact that task load
+ * is in reference to "best" performing cpu. Task's load will need to be
+ * scaled (up) by a factor to determine suitability to be placed on a
+ * (little) cpu.
+ */
+ load_scale *= load_scale_cpu_efficiency(cpu);
+ load_scale >>= 10;
+
+ load_scale *= load_scale_cpu_freq(cpu);
+ load_scale >>= 10;
+
+ return load_scale;
+}
+
+static int cpufreq_notifier_policy(struct notifier_block *nb,
+ unsigned long val, void *data)
+{
+ struct cpufreq_policy *policy = (struct cpufreq_policy *)data;
+ int i, update_max = 0;
+ u64 highest_mpc = 0, highest_mplsf = 0;
+ const struct cpumask *cpus = policy->related_cpus;
+ unsigned int orig_min_max_freq = min_max_freq;
+ unsigned int orig_max_possible_freq = max_possible_freq;
+ /* Initialized to policy->max in case policy->related_cpus is empty! */
+ unsigned int orig_max_freq = policy->max;
+
+ if (val != CPUFREQ_NOTIFY && val != CPUFREQ_REMOVE_POLICY &&
+ val != CPUFREQ_CREATE_POLICY)
+ return 0;
+
+ if (val == CPUFREQ_REMOVE_POLICY || val == CPUFREQ_CREATE_POLICY) {
+ update_min_max_capacity();
+ return 0;
+ }
+
+ for_each_cpu(i, policy->related_cpus) {
+ cpumask_copy(&cpu_rq(i)->freq_domain_cpumask,
+ policy->related_cpus);
+ orig_max_freq = cpu_rq(i)->max_freq;
+ cpu_rq(i)->min_freq = policy->min;
+ cpu_rq(i)->max_freq = policy->max;
+ cpu_rq(i)->cur_freq = policy->cur;
+ cpu_rq(i)->max_possible_freq = policy->cpuinfo.max_freq;
+ }
+
+ max_possible_freq = max(max_possible_freq, policy->cpuinfo.max_freq);
+ if (min_max_freq == 1)
+ min_max_freq = UINT_MAX;
+ min_max_freq = min(min_max_freq, policy->cpuinfo.max_freq);
+ BUG_ON(!min_max_freq);
+ BUG_ON(!policy->max);
+
+ /* Changes to policy other than max_freq don't require any updates */
+ if (orig_max_freq == policy->max)
+ return 0;
+
+ /*
+ * A changed min_max_freq or max_possible_freq (possible during bootup)
+ * needs to trigger re-computation of load_scale_factor and capacity for
+ * all possible cpus (even those offline). It also needs to trigger
+ * re-computation of nr_big_task count on all online cpus.
+ *
+ * A changed rq->max_freq otoh needs to trigger re-computation of
+ * load_scale_factor and capacity for just the cluster of cpus involved.
+ * Since small task definition depends on max_load_scale_factor, a
+ * changed load_scale_factor of one cluster could influence
+ * classification of tasks in another cluster. Hence a changed
+ * rq->max_freq will need to trigger re-computation of nr_big_task
+ * count on all online cpus.
+ *
+ * While it should be sufficient for nr_big_tasks to be
+ * re-computed for only online cpus, we have inadequate context
+ * information here (in policy notifier) with regard to hotplug-safety
+ * context in which notification is issued. As a result, we can't use
+ * get_online_cpus() here, as it can lead to deadlock. Until cpufreq is
+ * fixed up to issue notification always in hotplug-safe context,
+ * re-compute nr_big_task for all possible cpus.
+ */
+
+ if (orig_min_max_freq != min_max_freq ||
+ orig_max_possible_freq != max_possible_freq) {
+ cpus = cpu_possible_mask;
+ update_max = 1;
+ }
+
+ /*
+ * Changed load_scale_factor can trigger reclassification of tasks as
+ * big or small. Make this change "atomic" so that tasks are accounted
+ * properly due to changed load_scale_factor
+ */
+ for_each_cpu(i, cpus) {
+ struct rq *rq = cpu_rq(i);
+
+ rq->capacity = compute_capacity(i);
+ rq->load_scale_factor = compute_load_scale_factor(i);
+
+ if (update_max) {
+ u64 mpc, mplsf;
+
+ mpc = div_u64(((u64) rq->capacity) *
+ rq->max_possible_freq, rq->max_freq);
+ rq->max_possible_capacity = (int) mpc;
+
+ mplsf = div_u64(((u64) rq->load_scale_factor) *
+ rq->max_possible_freq, rq->max_freq);
+
+ if (mpc > highest_mpc) {
+ highest_mpc = mpc;
+ cpumask_clear(&mpc_mask);
+ cpumask_set_cpu(i, &mpc_mask);
+ } else if (mpc == highest_mpc) {
+ cpumask_set_cpu(i, &mpc_mask);
+ }
+
+ if (mplsf > highest_mplsf)
+ highest_mplsf = mplsf;
+ }
+ }
+
+ if (update_max) {
+ max_possible_capacity = highest_mpc;
+ max_load_scale_factor = highest_mplsf;
+ }
+
+ __update_min_max_capacity();
+
+ return 0;
+}
+
+static int cpufreq_notifier_trans(struct notifier_block *nb,
+ unsigned long val, void *data)
+{
+ struct cpufreq_freqs *freq = (struct cpufreq_freqs *)data;
+ unsigned int cpu = freq->cpu, new_freq = freq->new;
+ unsigned long flags;
+ int i;
+
+ if (val != CPUFREQ_POSTCHANGE)
+ return 0;
+
+ BUG_ON(!new_freq);
+
+ if (cpu_rq(cpu)->cur_freq == new_freq)
+ return 0;
+
+ for_each_cpu(i, &cpu_rq(cpu)->freq_domain_cpumask) {
+ struct rq *rq = cpu_rq(i);
+
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ walt_update_task_ravg(rq->curr, rq, TASK_UPDATE,
+ walt_ktime_clock(), 0);
+ rq->cur_freq = new_freq;
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+ }
+
+ return 0;
+}
+
+static struct notifier_block notifier_policy_block = {
+ .notifier_call = cpufreq_notifier_policy
+};
+
+static struct notifier_block notifier_trans_block = {
+ .notifier_call = cpufreq_notifier_trans
+};
+
+static int register_sched_callback(void)
+{
+ int ret;
+
+ ret = cpufreq_register_notifier(&notifier_policy_block,
+ CPUFREQ_POLICY_NOTIFIER);
+
+ if (!ret)
+ ret = cpufreq_register_notifier(&notifier_trans_block,
+ CPUFREQ_TRANSITION_NOTIFIER);
+
+ return 0;
+}
+
+/*
+ * cpufreq callbacks can be registered at core_initcall or later time.
+ * Any registration done prior to that is "forgotten" by cpufreq. See
+ * initialization of variable init_cpufreq_transition_notifier_list_called
+ * for further information.
+ */
+core_initcall(register_sched_callback);
+
+void walt_init_new_task_load(struct task_struct *p)
+{
+ int i;
+ u32 init_load_windows =
+ div64_u64((u64)sysctl_sched_walt_init_task_load_pct *
+ (u64)walt_ravg_window, 100);
+ u32 init_load_pct = current->init_load_pct;
+
+ p->init_load_pct = 0;
+ memset(&p->ravg, 0, sizeof(struct ravg));
+
+ if (init_load_pct) {
+ init_load_windows = div64_u64((u64)init_load_pct *
+ (u64)walt_ravg_window, 100);
+ }
+
+ p->ravg.demand = init_load_windows;
+ for (i = 0; i < RAVG_HIST_SIZE_MAX; ++i)
+ p->ravg.sum_history[i] = init_load_windows;
+}
diff --git a/kernel/sched/walt.h b/kernel/sched/walt.h
new file mode 100644
index 000000000000..e181c87a928d
--- /dev/null
+++ b/kernel/sched/walt.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2016, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __WALT_H
+#define __WALT_H
+
+#ifdef CONFIG_SCHED_WALT
+
+void walt_update_task_ravg(struct task_struct *p, struct rq *rq, int event,
+ u64 wallclock, u64 irqtime);
+void walt_inc_cumulative_runnable_avg(struct rq *rq, struct task_struct *p);
+void walt_dec_cumulative_runnable_avg(struct rq *rq, struct task_struct *p);
+void walt_inc_cfs_cumulative_runnable_avg(struct cfs_rq *rq,
+ struct task_struct *p);
+void walt_dec_cfs_cumulative_runnable_avg(struct cfs_rq *rq,
+ struct task_struct *p);
+void walt_fixup_busy_time(struct task_struct *p, int new_cpu);
+void walt_init_new_task_load(struct task_struct *p);
+void walt_mark_task_starting(struct task_struct *p);
+void walt_set_window_start(struct rq *rq);
+void walt_migrate_sync_cpu(int cpu);
+void walt_init_cpu_efficiency(void);
+u64 walt_ktime_clock(void);
+void walt_account_irqtime(int cpu, struct task_struct *curr, u64 delta,
+ u64 wallclock);
+
+u64 walt_irqload(int cpu);
+int walt_cpu_high_irqload(int cpu);
+
+#else /* CONFIG_SCHED_WALT */
+
+static inline void walt_update_task_ravg(struct task_struct *p, struct rq *rq,
+ int event, u64 wallclock, u64 irqtime) { }
+static inline void walt_inc_cumulative_runnable_avg(struct rq *rq, struct task_struct *p) { }
+static inline void walt_dec_cumulative_runnable_avg(struct rq *rq, struct task_struct *p) { }
+static inline void walt_inc_cfs_cumulative_runnable_avg(struct cfs_rq *rq,
+ struct task_struct *p) { }
+static inline void walt_dec_cfs_cumulative_runnable_avg(struct cfs_rq *rq,
+ struct task_struct *p) { }
+static inline void walt_fixup_busy_time(struct task_struct *p, int new_cpu) { }
+static inline void walt_init_new_task_load(struct task_struct *p) { }
+static inline void walt_mark_task_starting(struct task_struct *p) { }
+static inline void walt_set_window_start(struct rq *rq) { }
+static inline void walt_migrate_sync_cpu(int cpu) { }
+static inline void walt_init_cpu_efficiency(void) { }
+static inline u64 walt_ktime_clock(void) { return 0; }
+
+#endif /* CONFIG_SCHED_WALT */
+
+extern unsigned int walt_disabled;
+
+#endif