summaryrefslogtreecommitdiff
path: root/kernel/sched/rt.c (follow)
Commit message (Collapse)AuthorAge
...
* | | Merge branch 'v4.4-16.09-android-tmp' into lsk-v4.4-16.09-androidRunmin Wang2016-12-16
|\ \ \ | |/ / |/| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | * v4.4-16.09-android-tmp: unsafe_[get|put]_user: change interface to use a error target label usercopy: remove page-spanning test for now usercopy: fix overlap check for kernel text mm/slub: support left redzone Linux 4.4.21 lib/mpi: mpi_write_sgl(): fix skipping of leading zero limbs regulator: anatop: allow regulator to be in bypass mode hwrng: exynos - Disable runtime PM on probe failure cpufreq: Fix GOV_LIMITS handling for the userspace governor metag: Fix atomic_*_return inline asm constraints scsi: fix upper bounds check of sense key in scsi_sense_key_string() ALSA: timer: fix NULL pointer dereference on memory allocation failure ALSA: timer: fix division by zero after SNDRV_TIMER_IOCTL_CONTINUE ALSA: timer: fix NULL pointer dereference in read()/ioctl() race ALSA: hda - Enable subwoofer on Dell Inspiron 7559 ALSA: hda - Add headset mic quirk for Dell Inspiron 5468 ALSA: rawmidi: Fix possible deadlock with virmidi registration ALSA: fireworks: accessing to user space outside spinlock ALSA: firewire-tascam: accessing to user space outside spinlock ALSA: usb-audio: Add sample rate inquiry quirk for B850V3 CP2114 crypto: caam - fix IV loading for authenc (giv)decryption uprobes: Fix the memcg accounting x86/apic: Do not init irq remapping if ioapic is disabled vhost/scsi: fix reuse of &vq->iov[out] in response bcache: RESERVE_PRIO is too small by one when prio_buckets() is a power of two. ubifs: Fix assertion in layout_in_gaps() ovl: fix workdir creation ovl: listxattr: use strnlen() ovl: remove posix_acl_default from workdir ovl: don't copy up opaqueness wrappers for ->i_mutex access lustre: remove unused declaration timekeeping: Avoid taking lock in NMI path with CONFIG_DEBUG_TIMEKEEPING timekeeping: Cap array access in timekeeping_debug xfs: fix superblock inprogress check ASoC: atmel_ssc_dai: Don't unconditionally reset SSC on stream startup drm/msm: fix use of copy_from_user() while holding spinlock drm: Reject page_flip for !DRIVER_MODESET drm/radeon: fix radeon_move_blit on 32bit systems s390/sclp_ctl: fix potential information leak with /dev/sclp rds: fix an infoleak in rds_inc_info_copy powerpc/tm: Avoid SLB faults in treclaim/trecheckpoint when RI=0 nvme: Call pci_disable_device on the error path. cgroup: reduce read locked section of cgroup_threadgroup_rwsem during fork block: make sure a big bio is split into at most 256 bvecs block: Fix race triggered by blk_set_queue_dying() ext4: avoid modifying checksum fields directly during checksum verification ext4: avoid deadlock when expanding inode size ext4: properly align shifted xattrs when expanding inodes ext4: fix xattr shifting when expanding inodes part 2 ext4: fix xattr shifting when expanding inodes ext4: validate that metadata blocks do not overlap superblock net: Use ns_capable_noaudit() when determining net sysctl permissions kernel: Add noaudit variant of ns_capable() KEYS: Fix ASN.1 indefinite length object parsing drivers:hv: Lock access to hyperv_mmio resource tree cxlflash: Move to exponential back-off when cmd_room is not available netfilter: x_tables: check for size overflow drm/amdgpu/cz: enable/disable vce dpm even if vce pg is disabled cred: Reject inodes with invalid ids in set_create_file_as() fs: Check for invalid i_uid in may_follow_link() IB/IPoIB: Do not set skb truesize since using one linearskb udp: properly support MSG_PEEK with truncated buffers crypto: nx-842 - Mask XERS0 bit in return value cxlflash: Fix to avoid virtual LUN failover failure cxlflash: Fix to escalate LINK_RESET also on port 1 tipc: fix nl compat regression for link statistics tipc: fix an infoleak in tipc_nl_compat_link_dump netfilter: x_tables: check for size overflow Bluetooth: Add support for Intel Bluetooth device 8265 [8087:0a2b] drm/i915: Check VBT for port presence in addition to the strap on VLV/CHV drm/i915: Only ignore eDP ports that are connected Input: xpad - move pending clear to the correct location net: thunderx: Fix link status reporting x86/hyperv: Avoid reporting bogus NMI status for Gen2 instances crypto: vmx - IV size failing on skcipher API tda10071: Fix dependency to REGMAP_I2C crypto: vmx - Fix ABI detection crypto: vmx - comply with ABIs that specify vrsave as reserved. HID: core: prevent out-of-bound readings lpfc: Fix DMA faults observed upon plugging loopback connector block: fix blk_rq_get_max_sectors for driver private requests irqchip/gicv3-its: numa: Enable workaround for Cavium thunderx erratum 23144 clocksource: Allow unregistering the watchdog btrfs: Continue write in case of can_not_nocow blk-mq: End unstarted requests on dying queue cxlflash: Fix to resolve dead-lock during EEH recovery drm/radeon/mst: fix regression in lane/link handling. ecryptfs: fix handling of directory opening ALSA: hda: add AMD Polaris-10/11 AZ PCI IDs with proper driver caps drm: Balance error path for GEM handle allocation ntp: Fix ADJ_SETOFFSET being used w/ ADJ_NANO time: Verify time values in adjtimex ADJ_SETOFFSET to avoid overflow Input: xpad - correctly handle concurrent LED and FF requests net: thunderx: Fix receive packet stats net: thunderx: Fix for multiqset not configured upon interface toggle perf/x86/cqm: Fix CQM memory leak and notifier leak perf/x86/cqm: Fix CQM handling of grouping events into a cache_group s390/crypto: provide correct file mode at device register. proc: revert /proc/<pid>/maps [stack:TID] annotation intel_idle: Support for Intel Xeon Phi Processor x200 Product Family cxlflash: Fix to avoid unnecessary scan with internal LUNs Drivers: hv: vmbus: don't manipulate with clocksources on crash Drivers: hv: vmbus: avoid scheduling in interrupt context in vmbus_initiate_unload() Drivers: hv: vmbus: avoid infinite loop in init_vp_index() arcmsr: fixes not release allocated resource arcmsr: fixed getting wrong configuration data s390/pci_dma: fix DMA table corruption with > 4 TB main memory net/mlx5e: Don't modify CQ before it was created net/mlx5e: Don't try to modify CQ moderation if it is not supported mmc: sdhci: Do not BUG on invalid vdd UVC: Add support for R200 depth camera sched/numa: Fix use-after-free bug in the task_numa_compare ALSA: hda - add codec support for Kabylake display audio codec drm/i915: Fix hpd live status bits for g4x tipc: fix nullptr crash during subscription cancel arm64: Add workaround for Cavium erratum 27456 net: thunderx: Fix for Qset error due to CQ full drm/radeon: fix dp link rate selection (v2) drm/amdgpu: fix dp link rate selection (v2) qla2xxx: Use ATIO type to send correct tmr response mmc: sdhci: 64-bit DMA actually has 4-byte alignment drm/atomic: Do not unset crtc when an encoder is stolen drm/i915/skl: Add missing SKL ids drm/i915/bxt: update list of PCIIDs hrtimer: Catch illegal clockids i40e/i40evf: Fix RSS rx-flow-hash configuration through ethtool mpt3sas: Fix for Asynchronous completion of timedout IO and task abort of timedout IO. mpt3sas: A correction in unmap_resources net: cavium: liquidio: fix check for in progress flag arm64: KVM: Configure TCR_EL2.PS at runtime irqchip/gic-v3: Make sure read from ICC_IAR1_EL1 is visible on redestributor pwm: lpc32xx: fix and simplify duty cycle and period calculations pwm: lpc32xx: correct number of PWM channels from 2 to 1 pwm: fsl-ftm: Fix clock enable/disable when using PM megaraid_sas: Add an i/o barrier megaraid_sas: Fix SMAP issue megaraid_sas: Do not allow PCI access during OCR s390/cio: update measurement characteristics s390/cio: ensure consistent measurement state s390/cio: fix measurement characteristics memleak qeth: initialize net_device with carrier off lpfc: Fix external loopback failure. lpfc: Fix mbox reuse in PLOGI completion lpfc: Fix RDP Speed reporting. lpfc: Fix crash in fcp command completion path. lpfc: Fix driver crash when module parameter lpfc_fcp_io_channel set to 16 lpfc: Fix RegLogin failed error seen on Lancer FC during port bounce lpfc: Fix the FLOGI discovery logic to comply with T11 standards lpfc: Fix FCF Infinite loop in lpfc_sli4_fcf_rr_next_index_get. cxl: Enable PCI device ID for future IBM CXL adapter cxl: fix build for GCC 4.6.x cxlflash: Enable device id for future IBM CXL adapter cxlflash: Resolve oops in wait_port_offline cxlflash: Fix to resolve cmd leak after host reset cxl: Fix DSI misses when the context owning task exits cxl: Fix possible idr warning when contexts are released Drivers: hv: vmbus: fix rescind-offer handling for device without a driver Drivers: hv: vmbus: serialize process_chn_event() and vmbus_close_internal() Drivers: hv: vss: run only on supported host versions drivers/hv: cleanup synic msrs if vmbus connect failed Drivers: hv: util: catch allocation errors tools: hv: report ENOSPC errors in hv_fcopy_daemon Drivers: hv: utils: run polling callback always in interrupt context Drivers: hv: util: Increase the timeout for util services lightnvm: fix missing grown bad block type lightnvm: fix locking and mempool in rrpc_lun_gc lightnvm: unlock rq and free ppa_list on submission fail lightnvm: add check after mempool allocation lightnvm: fix incorrect nr_free_blocks stat lightnvm: fix bio submission issue cxlflash: a couple off by one bugs fm10k: Cleanup exception handling for mailbox interrupt fm10k: Cleanup MSI-X interrupts in case of failure fm10k: reinitialize queuing scheme after calling init_hw fm10k: always check init_hw for errors fm10k: reset max_queues on init_hw_vf failure fm10k: Fix handling of NAPI budget when multiple queues are enabled per vector fm10k: Correct MTU for jumbo frames fm10k: do not assume VF always has 1 queue clk: xgene: Fix divider with non-zero shift value e1000e: fix division by zero on jumbo MTUs e1000: fix data race between tx_ring->next_to_clean ixgbe: Fix handling of NAPI budget when multiple queues are enabled per vector igb: fix NULL derefs due to skipped SR-IOV enabling igb: use the correct i210 register for EEMNGCTL igb: don't unmap NULL hw_addr i40e: Fix Rx hash reported to the stack by our driver i40e: clean whole mac filter list i40evf: check rings before freeing resources i40e: don't add zero MAC filter i40e: properly delete VF MAC filters i40e: Fix memory leaks, sideband filter programming i40e: fix: do not sleep in netdev_ops i40e/i40evf: Fix RS bit update in Tx path and disable force WB workaround i40evf: handle many MAC filters correctly i40e: Workaround fix for mss < 256 issue UPSTREAM: audit: fix a double fetch in audit_log_single_execve_arg() UPSTREAM: ARM: 8494/1: mm: Enable PXN when running non-LPAE kernel on LPAE processor FIXUP: sched/tune: update accouting before CPU capacity FIXUP: sched/tune: add fixes missing from a previous patch arm: Fix #if/#ifdef typo in topology.c arm: Fix build error "conflicting types for 'scale_cpu_capacity'" sched/walt: use do_div instead of division operator DEBUG: cpufreq: fix cpu_capacity tracing build for non-smp systems sched/walt: include missing header for arm_timer_read_counter() cpufreq: Kconfig: Fixup incorrect selection by CPU_FREQ_DEFAULT_GOV_SCHED sched/fair: Avoid redundant idle_cpu() call in update_sg_lb_stats() FIXUP: sched: scheduler-driven cpu frequency selection sched/rt: Add Kconfig option to enable panicking for RT throttling sched/rt: print RT tasks when RT throttling is activated UPSTREAM: sched: Fix a race between __kthread_bind() and sched_setaffinity() sched/fair: Favor higher cpus only for boosted tasks vmstat: make vmstat_updater deferrable again and shut down on idle sched/fair: call OPP update when going idle after migration sched/cpufreq_sched: fix thermal capping events sched/fair: Picking cpus with low OPPs for tasks that prefer idle CPUs FIXUP: sched/tune: do initialization as a postcore_initicall DEBUG: sched: add tracepoint for RD overutilized sched/tune: Introducing a new schedtune attribute prefer_idle sched: use util instead of capacity to select busy cpu arch_timer: add error handling when the MPM global timer is cleared FIXUP: sched: Fix double-release of spinlock in move_queued_task FIXUP: sched/fair: Fix hang during suspend in sched_group_energy FIXUP: sched: fix SchedFreq integration for both PELT and WALT sched: EAS: Avoid causing spikes to max-freq unnecessarily FIXUP: sched: fix set_cfs_cpu_capacity when WALT is in use sched/walt: Accounting for number of irqs pending on each core sched: Introduce Window Assisted Load Tracking (WALT) sched/tune: fix PB and PC cuts indexes definition sched/fair: optimize idle cpu selection for boosted tasks FIXUP: sched/tune: fix accounting for runnable tasks sched/tune: use a single initialisation function sched/{fair,tune}: simplify fair.c code FIXUP: sched/tune: fix payoff calculation for boost region sched/tune: Add support for negative boost values FIX: sched/tune: move schedtune_nornalize_energy into fair.c FIX: sched/tune: update usage of boosted task utilisation on CPU selection sched/fair: add tunable to set initial task load sched/fair: add tunable to force selection at cpu granularity sched: EAS: take cstate into account when selecting idle core sched/cpufreq_sched: Consolidated update FIXUP: sched: fix build for non-SMP target DEBUG: sched/tune: add tracepoint on P-E space filtering DEBUG: sched/tune: add tracepoint for energy_diff() values DEBUG: sched/tune: add tracepoint for task boost signal arm: topology: Define TC2 energy and provide it to the scheduler CHROMIUM: sched: update the average of nr_running DEBUG: schedtune: add tracepoint for schedtune_tasks_update() values DEBUG: schedtune: add tracepoint for CPU boost signal DEBUG: schedtune: add tracepoint for SchedTune configuration update DEBUG: sched: add energy procfs interface DEBUG: sched,cpufreq: add cpu_capacity change tracepoint DEBUG: sched: add tracepoint for CPU load/util signals DEBUG: sched: add tracepoint for task load/util signals DEBUG: sched: add tracepoint for cpu/freq scale invariance sched/fair: filter energy_diff() based on energy_payoff value sched/tune: add support to compute normalized energy sched/fair: keep track of energy/capacity variations sched/fair: add boosted task utilization sched/{fair,tune}: track RUNNABLE tasks impact on per CPU boost value sched/tune: compute and keep track of per CPU boost value sched/tune: add initial support for CGroups based boosting sched/fair: add boosted CPU usage sched/fair: add function to convert boost value into "margin" sched/tune: add sysctl interface to define a boost value sched/tune: add detailed documentation fixup! sched/fair: jump to max OPP when crossing UP threshold fixup! sched: scheduler-driven cpu frequency selection sched: rt scheduler sets capacity requirement sched: deadline: use deadline bandwidth in scale_rt_capacity sched: remove call of sched_avg_update from sched_rt_avg_update sched/cpufreq_sched: add trace events sched/fair: jump to max OPP when crossing UP threshold sched/fair: cpufreq_sched triggers for load balancing sched/{core,fair}: trigger OPP change request on fork() sched/fair: add triggers for OPP change requests sched: scheduler-driven cpu frequency selection cpufreq: introduce cpufreq_driver_is_slow sched: Consider misfit tasks when load-balancing sched: Add group_misfit_task load-balance type sched: Add per-cpu max capacity to sched_group_capacity sched: Do eas idle balance regardless of the rq avg idle value arm64: Enable max freq invariant scheduler load-tracking and capacity support arm: Enable max freq invariant scheduler load-tracking and capacity support sched: Update max cpu capacity in case of max frequency constraints cpufreq: Max freq invariant scheduler load-tracking and cpu capacity support arm64, topology: Updates to use DT bindings for EAS costing data sched: Support for extracting EAS energy costs from DT Documentation: DT bindings for energy model cost data required by EAS sched: Disable energy-unfriendly nohz kicks sched: Consider a not over-utilized energy-aware system as balanced sched: Energy-aware wake-up task placement sched: Determine the current sched_group idle-state sched, cpuidle: Track cpuidle state index in the scheduler sched: Add over-utilization/tipping point indicator sched: Estimate energy impact of scheduling decisions sched: Extend sched_group_energy to test load-balancing decisions sched: Calculate energy consumption of sched_group sched: Highest energy aware balancing sched_domain level pointer sched: Relocated cpu_util() and change return type sched: Compute cpu capacity available at current frequency arm64: Cpu invariant scheduler load-tracking and capacity support arm: Cpu invariant scheduler load-tracking and capacity support sched: Introduce SD_SHARE_CAP_STATES sched_domain flag sched: Initialize energy data structures sched: Introduce energy data structures sched: Make energy awareness a sched feature sched: Documentation for scheduler energy cost model sched: Prevent unnecessary active balance of single task in sched group sched: Enable idle balance to pull single task towards cpu with higher capacity sched: Consider spare cpu capacity at task wake-up sched: Add cpu capacity awareness to wakeup balancing sched: Store system-wide maximum cpu capacity in root domain arm: Update arch_scale_cpu_capacity() to reflect change to define arm64: Enable frequency invariant scheduler load-tracking support arm: Enable frequency invariant scheduler load-tracking support cpufreq: Frequency invariant scheduler load-tracking support sched/fair: Fix new task's load avg removed from source CPU in wake_up_new_task() FROMLIST: pstore: drop pmsg bounce buffer UPSTREAM: usercopy: remove page-spanning test for now UPSTREAM: usercopy: force check_object_size() inline BACKPORT: usercopy: fold builtin_const check into inline function UPSTREAM: x86/uaccess: force copy_*_user() to be inlined UPSTREAM: HID: core: prevent out-of-bound readings Android: Fix build breakages. UPSTREAM: tty: Prevent ldisc drivers from re-using stale tty fields UPSTREAM: netfilter: nfnetlink: correctly validate length of batch messages cpuset: Make cpusets restore on hotplug UPSTREAM: mm/slub: support left redzone UPSTREAM: Make the hardened user-copy code depend on having a hardened allocator Android: MMC/UFS IO Latency Histograms. UPSTREAM: usercopy: fix overlap check for kernel text UPSTREAM: usercopy: avoid potentially undefined behavior in pointer math UPSTREAM: unsafe_[get|put]_user: change interface to use a error target label BACKPORT: arm64: mm: fix location of _etext BACKPORT: ARM: 8583/1: mm: fix location of _etext BACKPORT: Don't show empty tag stats for unprivileged uids UPSTREAM: tcp: fix use after free in tcp_xmit_retransmit_queue() ANDROID: base-cfg: drop SECCOMP_FILTER config UPSTREAM: [media] xc2028: unlock on error in xc2028_set_config() UPSTREAM: [media] xc2028: avoid use after free ANDROID: base-cfg: enable SECCOMP config ANDROID: rcu_sync: Export rcu_sync_lockdep_assert RFC: FROMLIST: cgroup: reduce read locked section of cgroup_threadgroup_rwsem during fork RFC: FROMLIST: cgroup: avoid synchronize_sched() in __cgroup_procs_write() RFC: FROMLIST: locking/percpu-rwsem: Optimize readers and reduce global impact net: ipv6: Fix ping to link-local addresses. ipv6: fix endianness error in icmpv6_err ANDROID: dm: android-verity: Allow android-verity to be compiled as an independent module backporting: a brief introduce of backported feautures on 4.4 Linux 4.4.20 sysfs: correctly handle read offset on PREALLOC attrs hwmon: (iio_hwmon) fix memory leak in name attribute ALSA: line6: Fix POD sysfs attributes segfault ALSA: line6: Give up on the lock while URBs are released. ALSA: line6: Remove double line6_pcm_release() after failed acquire. ACPI / SRAT: fix SRAT parsing order with both LAPIC and X2APIC present ACPI / sysfs: fix error code in get_status() ACPI / drivers: replace acpi_probe_lock spinlock with mutex ACPI / drivers: fix typo in ACPI_DECLARE_PROBE_ENTRY macro staging: comedi: ni_mio_common: fix wrong insn_write handler staging: comedi: ni_mio_common: fix AO inttrig backwards compatibility staging: comedi: comedi_test: fix timer race conditions staging: comedi: daqboard2000: bug fix board type matching code USB: serial: option: add WeTelecom 0x6802 and 0x6803 products USB: serial: option: add WeTelecom WM-D200 USB: serial: mos7840: fix non-atomic allocation in write path USB: serial: mos7720: fix non-atomic allocation in write path USB: fix typo in wMaxPacketSize validation usb: chipidea: udc: don't touch DP when controller is in host mode USB: avoid left shift by -1 dmaengine: usb-dmac: check CHCR.DE bit in usb_dmac_isr_channel() crypto: qat - fix aes-xts key sizes crypto: nx - off by one bug in nx_of_update_msc() Input: i8042 - set up shared ps2_cmd_mutex for AUX ports Input: i8042 - break load dependency between atkbd/psmouse and i8042 Input: tegra-kbc - fix inverted reset logic btrfs: properly track when rescan worker is running btrfs: waiting on qgroup rescan should not always be interruptible fs/seq_file: fix out-of-bounds read gpio: Fix OF build problem on UM usb: renesas_usbhs: gadget: fix return value check in usbhs_mod_gadget_probe() megaraid_sas: Fix probing cards without io port mpt3sas: Fix resume on WarpDrive flash cards cdc-acm: fix wrong pipe type on rx interrupt xfers i2c: cros-ec-tunnel: Fix usage of cros_ec_cmd_xfer() mfd: cros_ec: Add cros_ec_cmd_xfer_status() helper aacraid: Check size values after double-fetch from user ARC: Elide redundant setup of DMA callbacks ARC: Call trace_hardirqs_on() before enabling irqs ARC: use correct offset in pt_regs for saving/restoring user mode r25 ARC: build: Better way to detect ISA compatible toolchain drm/i915: fix aliasing_ppgtt leak drm/amdgpu: record error code when ring test failed drm/amd/amdgpu: sdma resume fail during S4 on CI drm/amdgpu: skip TV/CV in display parsing drm/amdgpu: avoid a possible array overflow drm/amdgpu: fix amdgpu_move_blit on 32bit systems drm/amdgpu: Change GART offset to 64-bit iio: fix sched WARNING "do not call blocking ops when !TASK_RUNNING" sched/nohz: Fix affine unpinned timers mess sched/cputime: Fix NO_HZ_FULL getrusage() monotonicity regression of: fix reference counting in of_graph_get_endpoint_by_regs arm64: dts: rockchip: add reset saradc node for rk3368 SoCs mac80211: fix purging multicast PS buffer queue s390/dasd: fix hanging device after clear subchannel EDAC: Increment correct counter in edac_inc_ue_error() pinctrl/amd: Remove the default de-bounce time iommu/arm-smmu: Don't BUG() if we find aborting STEs with disable_bypass iommu/arm-smmu: Fix CMDQ error handling iommu/dma: Don't put uninitialised IOVA domains xhci: Make sure xhci handles USB_SPEED_SUPER_PLUS devices. USB: serial: ftdi_sio: add PIDs for Ivium Technologies devices USB: serial: ftdi_sio: add device ID for WICED USB UART dev board USB: serial: option: add support for Telit LE920A4 USB: serial: option: add D-Link DWM-156/A3 USB: serial: fix memleak in driver-registration error path xhci: don't dereference a xhci member after removing xhci usb: xhci: Fix panic if disconnect xhci: always handle "Command Ring Stopped" events usb/gadget: fix gadgetfs aio support. usb: gadget: fsl_qe_udc: off by one in setup_received_handle() USB: validate wMaxPacketValue entries in endpoint descriptors usb: renesas_usbhs: Use dmac only if the pipe type is bulk usb: renesas_usbhs: clear the BRDYSTS in usbhsg_ep_enable() USB: hub: change the locking in hub_activate USB: hub: fix up early-exit pathway in hub_activate usb: hub: Fix unbalanced reference count/memory leak/deadlocks usb: define USB_SPEED_SUPER_PLUS speed for SuperSpeedPlus USB3.1 devices usb: dwc3: gadget: increment request->actual once usb: dwc3: pci: add Intel Kabylake PCI ID usb: misc: usbtest: add fix for driver hang usb: ehci: change order of register cleanup during shutdown crypto: caam - defer aead_set_sh_desc in case of zero authsize crypto: caam - fix echainiv(authenc) encrypt shared descriptor crypto: caam - fix non-hmac hashes genirq/msi: Make sure PCI MSIs are activated early genirq/msi: Remove unused MSI_FLAG_IDENTITY_MAP um: Don't discard .text.exit section ACPI / CPPC: Prevent cpc_desc_ptr points to the invalid data ACPI: CPPC: Return error if _CPC is invalid on a CPU mmc: sdhci-acpi: Reduce Baytrail eMMC/SD/SDIO hangs PCI: Limit config space size for Netronome NFP4000 PCI: Add Netronome NFP4000 PF device ID PCI: Limit config space size for Netronome NFP6000 family PCI: Add Netronome vendor and device IDs PCI: Support PCIe devices with short cfg_size NVMe: Don't unmap controller registers on reset ALSA: hda - Manage power well properly for resume libnvdimm, nd_blk: mask off reserved status bits perf intel-pt: Fix occasional decoding errors when tracing system-wide vfio/pci: Fix NULL pointer oops in error interrupt setup handling virtio: fix memory leak in virtqueue_add() parisc: Fix order of EREFUSED define in errno.h arm64: Define AT_VECTOR_SIZE_ARCH for ARCH_DLINFO ALSA: usb-audio: Add quirk for ELP HD USB Camera ALSA: usb-audio: Add a sample rate quirk for Creative Live! Cam Socialize HD (VF0610) powerpc/eeh: eeh_pci_enable(): fix checking of post-request state SUNRPC: allow for upcalls for same uid but different gss service SUNRPC: Handle EADDRNOTAVAIL on connection failures tools/testing/nvdimm: fix SIGTERM vs hotplug crash uprobes/x86: Fix RIP-relative handling of EVEX-encoded instructions x86/mm: Disable preemption during CR3 read+write hugetlb: fix nr_pmds accounting with shared page tables mm: SLUB hardened usercopy support mm: SLAB hardened usercopy support s390/uaccess: Enable hardened usercopy sparc/uaccess: Enable hardened usercopy powerpc/uaccess: Enable hardened usercopy ia64/uaccess: Enable hardened usercopy arm64/uaccess: Enable hardened usercopy ARM: uaccess: Enable hardened usercopy x86/uaccess: Enable hardened usercopy x86: remove more uaccess_32.h complexity x86: remove pointless uaccess_32.h complexity x86: fix SMAP in 32-bit environments Use the new batched user accesses in generic user string handling Add 'unsafe' user access functions for batched accesses x86: reorganize SMAP handling in user space accesses mm: Hardened usercopy mm: Implement stack frame object validation mm: Add is_migrate_cma_page Linux 4.4.19 Documentation/module-signing.txt: Note need for version info if reusing a key module: Invalidate signatures on force-loaded modules dm flakey: error READ bios during the down_interval rtc: s3c: Add s3c_rtc_{enable/disable}_clk in s3c_rtc_setfreq() lpfc: fix oops in lpfc_sli4_scmd_to_wqidx_distr() from lpfc_send_taskmgmt() ACPI / EC: Work around method reentrancy limit in ACPICA for _Qxx x86/platform/intel_mid_pci: Rework IRQ0 workaround PCI: Mark Atheros AR9485 and QCA9882 to avoid bus reset MIPS: hpet: Increase HPET_MIN_PROG_DELTA and decrease HPET_MIN_CYCLES MIPS: Don't register r4k sched clock when CPUFREQ enabled MIPS: mm: Fix definition of R6 cache instruction SUNRPC: Don't allocate a full sockaddr_storage for tracing Input: elan_i2c - properly wake up touchpad on ASUS laptops target: Fix ordered task CHECK_CONDITION early exception handling target: Fix max_unmap_lba_count calc overflow target: Fix race between iscsi-target connection shutdown + ABORT_TASK target: Fix missing complete during ABORT_TASK + CMD_T_FABRIC_STOP target: Fix ordered task target_setup_cmd_from_cdb exception hang iscsi-target: Fix panic when adding second TCP connection to iSCSI session ubi: Fix race condition between ubi device creation and udev ubi: Fix early logging ubi: Make volume resize power cut aware of: fix memory leak related to safe_name() IB/mlx4: Fix memory leak if QP creation failed IB/mlx4: Fix error flow when sending mads under SRIOV IB/mlx4: Fix the SQ size of an RC QP IB/IWPM: Fix a potential skb leak IB/IPoIB: Don't update neigh validity for unresolved entries IB/SA: Use correct free function IB/mlx5: Return PORT_ERR in Active to Initializing tranisition IB/mlx5: Fix post send fence logic IB/mlx5: Fix entries check in mlx5_ib_resize_cq IB/mlx5: Fix returned values of query QP IB/mlx5: Fix entries checks in mlx5_ib_create_cq IB/mlx5: Fix MODIFY_QP command input structure ALSA: hda - Fix headset mic detection problem for two dell machines ALSA: hda: add AMD Bonaire AZ PCI ID with proper driver caps ALSA: hda/realtek - Can't adjust speaker's volume on a Dell AIO ALSA: hda: Fix krealloc() with __GFP_ZERO usage mm/hugetlb: avoid soft lockup in set_max_huge_pages() mtd: nand: fix bug writing 1 byte less than page size block: fix bdi vs gendisk lifetime mismatch block: add missing group association in bio-cloning functions metag: Fix __cmpxchg_u32 asm constraint for CMP ftrace/recordmcount: Work around for addition of metag magic but not relocations balloon: check the number of available pages in leak balloon drm/i915/dp: Revert "drm/i915/dp: fall back to 18 bpp when sink capability is unknown" drm/i915: Never fully mask the the EI up rps interrupt on SNB/IVB drm/edid: Add 6 bpc quirk for display AEO model 0. drm: Restore double clflush on the last partial cacheline drm/nouveau/fbcon: fix font width not divisible by 8 drm/nouveau/gr/nv3x: fix instobj write offsets in gr setup drm/nouveau: check for supported chipset before booting fbdev off the hw drm/radeon: support backlight control for UNIPHY3 drm/radeon: fix firmware info version checks drm/radeon: Poll for both connect/disconnect on analog connectors drm/radeon: add a delay after ATPX dGPU power off drm/amdgpu/gmc7: add missing mullins case drm/amdgpu: fix firmware info version checks drm/amdgpu: Disable RPM helpers while reprobing connectors on resume drm/amdgpu: support backlight control for UNIPHY3 drm/amdgpu: Poll for both connect/disconnect on analog connectors drm/amdgpu: add a delay after ATPX dGPU power off w1:omap_hdq: fix regression netlabel: add address family checks to netlbl_{sock,req}_delattr() ARM: dts: sunxi: Add a startup delay for fixed regulator enabled phys audit: fix a double fetch in audit_log_single_execve_arg() iommu/amd: Update Alias-DTE in update_device_table() iommu/amd: Init unity mappings only for dma_ops domains iommu/amd: Handle IOMMU_DOMAIN_DMA in ops->domain_free call-back iommu/vt-d: Return error code in domain_context_mapping_one() iommu/exynos: Suppress unbinding to prevent system failure drm/i915: Don't complain about lack of ACPI video bios nfsd: don't return an unhashed lock stateid after taking mutex nfsd: Fix race between FREE_STATEID and LOCK nfs: don't create zero-length requests MIPS: KVM: Propagate kseg0/mapped tlb fault errors MIPS: KVM: Fix gfn range check in kseg0 tlb faults MIPS: KVM: Add missing gfn range check MIPS: KVM: Fix mapped fault broken commpage handling random: add interrupt callback to VMBus IRQ handler random: print a warning for the first ten uninitialized random users random: initialize the non-blocking pool via add_hwgenerator_randomness() CIFS: Fix a possible invalid memory access in smb2_query_symlink() cifs: fix crash due to race in hmac(md5) handling cifs: Check for existing directory when opening file with O_CREAT fs/cifs: make share unaccessible at root level mountable jbd2: make journal y2038 safe ARC: mm: don't loose PTE_SPECIAL in pte_modify() remoteproc: Fix potential race condition in rproc_add ovl: disallow overlayfs as upperdir HID: uhid: fix timeout when probe races with IO EDAC: Correct channel count limit Bluetooth: Fix l2cap_sock_setsockopt() with optname BT_RCVMTU spi: pxa2xx: Clear all RFT bits in reset_sccr1() on Intel Quark i2c: efm32: fix a failure path in efm32_i2c_probe() s5p-mfc: Add release callback for memory region devs s5p-mfc: Set device name for reserved memory region devs hp-wmi: Fix wifi cannot be hard-unblocked dm: set DMF_SUSPENDED* _before_ clearing DMF_NOFLUSH_SUSPENDING sur40: fix occasional oopses on device close sur40: lower poll interval to fix occasional FPS drops to ~56 FPS Fix RC5 decoding with Fintek CIR chipset vb2: core: Skip planes array verification if pb is NULL videobuf2-v4l2: Verify planes array in buffer dequeueing media: dvb_ringbuffer: Add memory barriers media: usbtv: prevent access to free'd resources mfd: qcom_rpm: Parametrize also ack selector size mfd: qcom_rpm: Fix offset error for msm8660 intel_pstate: Fix MSR_CONFIG_TDP_x addressing in core_get_max_pstate() s390/cio: allow to reset channel measurement block KVM: nVMX: Fix memory corruption when using VMCS shadowing KVM: VMX: handle PML full VMEXIT that occurs during event delivery KVM: MTRR: fix kvm_mtrr_check_gfn_range_consistency page fault KVM: PPC: Book3S HV: Save/restore TM state in H_CEDE KVM: PPC: Book3S HV: Pull out TM state save/restore into separate procedures arm64: mm: avoid fdt_check_header() before the FDT is fully mapped arm64: dts: rockchip: fixes the gic400 2nd region size for rk3368 pinctrl: cherryview: prevent concurrent access to GPIO controllers Bluetooth: hci_intel: Fix null gpio desc pointer dereference gpio: intel-mid: Remove potentially harmful code gpio: pca953x: Fix NBANK calculation for PCA9536 tty/serial: atmel: fix RS485 half duplex with DMA serial: samsung: Fix ERR pointer dereference on deferred probe tty: serial: msm: Don't read off end of tx fifo arm64: Fix incorrect per-cpu usage for boot CPU arm64: debug: unmask PSTATE.D earlier arm64: kernel: Save and restore UAO and addr_limit on exception entry USB: usbfs: fix potential infoleak in devio usb: renesas_usbhs: fix NULL pointer dereference in xfer_work() USB: serial: option: add support for Telit LE910 PID 0x1206 usb: dwc3: fix for the isoc transfer EP_BUSY flag usb: quirks: Add no-lpm quirk for Elan usb: renesas_usbhs: protect the CFIFOSEL setting in usbhsg_ep_enable() usb: f_fs: off by one bug in _ffs_func_bind() usb: gadget: avoid exposing kernel stack UPSTREAM: usb: gadget: configfs: add mutex lock before unregister gadget ANDROID: dm-verity: adopt changes made to dm callbacks UPSTREAM: ecryptfs: fix handling of directory opening ANDROID: net: core: fix UID-based routing ANDROID: net: fib: remove duplicate assignment FROMLIST: proc: Fix timerslack_ns CAP_SYS_NICE check when adjusting self ANDROID: dm verity fec: pack the fec_header structure ANDROID: dm: android-verity: Verify header before fetching table ANDROID: dm: allow adb disable-verity only in userdebug ANDROID: dm: mount as linear target if eng build ANDROID: dm: use default verity public key ANDROID: dm: fix signature verification flag ANDROID: dm: use name_to_dev_t ANDROID: dm: rename dm-linear methods for dm-android-verity ANDROID: dm: Minor cleanup ANDROID: dm: Mounting root as linear device when verity disabled ANDROID: dm-android-verity: Rebase on top of 4.1 ANDROID: dm: Add android verity target ANDROID: dm: fix dm_substitute_devices() ANDROID: dm: Rebase on top of 4.1 CHROMIUM: dm: boot time specification of dm= Implement memory_state_time, used by qcom,cpubw Revert "panic: Add board ID to panic output" usb: gadget: f_accessory: remove duplicate endpoint alloc BACKPORT: brcmfmac: defer DPC processing during probe FROMLIST: proc: Add LSM hook checks to /proc/<tid>/timerslack_ns FROMLIST: proc: Relax /proc/<tid>/timerslack_ns capability requirements UPSTREAM: ppp: defer netns reference release for ppp channel cpuset: Add allow_attach hook for cpusets on android. UPSTREAM: KEYS: Fix ASN.1 indefinite length object parsing ANDROID: sdcardfs: fix itnull.cocci warnings android-recommended.cfg: enable fstack-protector-strong Linux 4.4.18 mm: memcontrol: fix memcg id ref counter on swap charge move mm: memcontrol: fix swap counter leak on swapout from offline cgroup mm: memcontrol: fix cgroup creation failure after many small jobs ext4: fix reference counting bug on block allocation error ext4: short-cut orphan cleanup on error ext4: validate s_reserved_gdt_blocks on mount ext4: don't call ext4_should_journal_data() on the journal inode ext4: fix deadlock during page writeback ext4: check for extents that wrap around crypto: scatterwalk - Fix test in scatterwalk_done crypto: gcm - Filter out async ghash if necessary fs/dcache.c: avoid soft-lockup in dput() fuse: fix wrong assignment of ->flags in fuse_send_init() fuse: fuse_flush must check mapping->flags for errors fuse: fsync() did not return IO errors sysv, ipc: fix security-layer leaking block: fix use-after-free in seq file x86/syscalls/64: Add compat_sys_keyctl for 32-bit userspace drm/i915: Pretend cursor is always on for ILK-style WM calculations (v2) x86/mm/pat: Fix BUG_ON() in mmap_mem() on QEMU/i386 x86/pat: Document the PAT initialization sequence x86/xen, pat: Remove PAT table init code from Xen x86/mtrr: Fix PAT init handling when MTRR is disabled x86/mtrr: Fix Xorg crashes in Qemu sessions x86/mm/pat: Replace cpu_has_pat with boot_cpu_has() x86/mm/pat: Add pat_disable() interface x86/mm/pat: Add support of non-default PAT MSR setting devpts: clean up interface to pty drivers random: strengthen input validation for RNDADDTOENTCNT apparmor: fix ref count leak when profile sha1 hash is read Revert "s390/kdump: Clear subchannel ID to signal non-CCW/SCSI IPL" KEYS: 64-bit MIPS needs to use compat_sys_keyctl for 32-bit userspace arm: oabi compat: add missing access checks cdc_ncm: do not call usbnet_link_change from cdc_ncm_bind i2c: i801: Allow ACPI SystemIO OpRegion to conflict with PCI BAR x86/mm/32: Enable full randomization on i386 and X86_32 HID: sony: do not bail out when the sixaxis refuses the output report PNP: Add Broadwell to Intel MCH size workaround PNP: Add Haswell-ULT to Intel MCH size workaround scsi: ignore errors from scsi_dh_add_device() ipath: Restrict use of the write() interface tcp: consider recv buf for the initial window scale qed: Fix setting/clearing bit in completion bitmap net/irda: fix NULL pointer dereference on memory allocation failure net: bgmac: Fix infinite loop in bgmac_dma_tx_add() bonding: set carrier off for devices created through netlink ipv4: reject RTNH_F_DEAD and RTNH_F_LINKDOWN from user space tcp: enable per-socket rate limiting of all 'challenge acks' tcp: make challenge acks less predictable arm64: relocatable: suppress R_AARCH64_ABS64 relocations in vmlinux arm64: vmlinux.lds: make __rela_offset and __dynsym_offset ABSOLUTE Linux 4.4.17 vfs: fix deadlock in file_remove_privs() on overlayfs intel_th: Fix a deadlock in modprobing intel_th: pci: Add Kaby Lake PCH-H support net: mvneta: set real interrupt per packet for tx_done libceph: apply new_state before new_up_client on incrementals libata: LITE-ON CX1-JB256-HP needs lower max_sectors i2c: mux: reg: wrong condition checked for of_address_to_resource return value posix_cpu_timer: Exit early when process has been reaped media: fix airspy usb probe error path ipr: Clear interrupt on croc/crocodile when running with LSI SCSI: fix new bug in scsi_dev_info_list string matching RDS: fix rds_tcp_init() error path can: fix oops caused by wrong rtnl dellink usage can: fix handling of unmodifiable configuration options fix can: c_can: Update D_CAN TX and RX functions to 32 bit - fix Altera Cyclone access can: at91_can: RX queue could get stuck at high bus load perf/x86: fix PEBS issues on Intel Atom/Core2 ovl: handle ATTR_KILL* sched/fair: Fix effective_load() to consistently use smoothed load mmc: block: fix packed command header endianness block: fix use-after-free in sys_ioprio_get() qeth: delete napi struct when removing a qeth device platform/chrome: cros_ec_dev - double fetch bug in ioctl clk: rockchip: initialize flags of clk_init_data in mmc-phase clock spi: sun4i: fix FIFO limit spi: sunxi: fix transfer timeout namespace: update event counter when umounting a deleted dentry 9p: use file_dentry() ext4: verify extent header depth ecryptfs: don't allow mmap when the lower fs doesn't support it Revert "ecryptfs: forbid opening files without mmap handler" locks: use file_inode() power_supply: power_supply_read_temp only if use_cnt > 0 cgroup: set css->id to -1 during init pinctrl: imx: Do not treat a PIN without MUX register as an error pinctrl: single: Fix missing flush of posted write for a wakeirq pvclock: Add CPU barriers to get correct version value Input: tsc200x - report proper input_dev name Input: xpad - validate USB endpoint count during probe Input: wacom_w8001 - w8001_MAX_LENGTH should be 13 Input: xpad - fix oops when attaching an unknown Xbox One gamepad Input: elantech - add more IC body types to the list Input: vmmouse - remove port reservation ALSA: timer: Fix leak in events via snd_timer_user_tinterrupt ALSA: timer: Fix leak in events via snd_timer_user_ccallback ALSA: timer: Fix leak in SNDRV_TIMER_IOCTL_PARAMS xenbus: don't bail early from xenbus_dev_request_and_reply() xenbus: don't BUG() on user mode induced condition xen/pciback: Fix conf_space read/write overlap check. ARC: unwind: ensure that .debug_frame is generated (vs. .eh_frame) arc: unwind: warn only once if DW2_UNWIND is disabled kernel/sysrq, watchdog, sched/core: Reset watchdog on all CPUs while processing sysrq-w pps: do not crash when failed to register vmlinux.lds: account for destructor sections mm, meminit: ensure node is online before checking whether pages are uninitialised mm, meminit: always return a valid node from early_pfn_to_nid mm, compaction: prevent VM_BUG_ON when terminating freeing scanner fs/nilfs2: fix potential underflow in call to crc32_le mm, compaction: abort free scanner if split fails mm, sl[au]b: add __GFP_ATOMIC to the GFP reclaim mask dmaengine: at_xdmac: double FIFO flush needed to compute residue dmaengine: at_xdmac: fix residue corruption dmaengine: at_xdmac: align descriptors on 64 bits x86/quirks: Add early quirk to reset Apple AirPort card x86/quirks: Reintroduce scanning of secondary buses x86/quirks: Apply nvidia_bugs quirk only on root bus USB: OHCI: Don't mark EDs as ED_OPER if scheduling fails Conflicts: arch/arm/kernel/topology.c arch/arm64/include/asm/arch_gicv3.h arch/arm64/kernel/topology.c block/bio.c drivers/cpufreq/Kconfig drivers/md/Makefile drivers/media/dvb-core/dvb_ringbuffer.c drivers/media/tuners/tuner-xc2028.c drivers/misc/Kconfig drivers/misc/Makefile drivers/mmc/core/host.c drivers/scsi/ufs/ufshcd.c drivers/scsi/ufs/ufshcd.h drivers/usb/dwc3/gadget.c drivers/usb/gadget/configfs.c fs/ecryptfs/file.c include/linux/mmc/core.h include/linux/mmc/host.h include/linux/mmzone.h include/linux/sched.h include/linux/sched/sysctl.h include/trace/events/power.h include/trace/events/sched.h init/Kconfig kernel/cpuset.c kernel/exit.c kernel/sched/Makefile kernel/sched/core.c kernel/sched/cputime.c kernel/sched/fair.c kernel/sched/features.h kernel/sched/rt.c kernel/sched/sched.h kernel/sched/stop_task.c kernel/sched/tune.c lib/Kconfig.debug mm/Makefile mm/vmstat.c Change-Id: I243a43231ca56a6362076fa6301827e1b0493be5 Signed-off-by: Runmin Wang <runminw@codeaurora.org>
| * | sched/rt: Add Kconfig option to enable panicking for RT throttlingMatt Wagantall2016-09-14
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | This may be useful for detecting and debugging RT throttling issues. Change-Id: I5807a897d11997d76421c1fcaa2918aad988c6c9 Signed-off-by: Matt Wagantall <mattw@codeaurora.org> [rameezmustafa@codeaurora.org]: Port to msm-3.18] Signed-off-by: Syed Rameez Mustafa <rameezmustafa@codeaurora.org> [jstultz: forwardported to 4.4] Signed-off-by: John Stultz <john.stultz@linaro.org>
| * | sched/rt: print RT tasks when RT throttling is activatedMatt Wagantall2016-09-14
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | Existing debug prints do not provide any clues about which tasks may have triggered RT throttling. Print the names and PIDs of all tasks on the throttled rt_rq to help narrow down the source of the problem. Change-Id: I180534c8a647254ed38e89d0c981a8f8bccd741c Signed-off-by: Matt Wagantall <mattw@codeaurora.org> [rameezmustafa@codeaurora.org]: Port to msm-3.18] Signed-off-by: Syed Rameez Mustafa <rameezmustafa@codeaurora.org>
| * | sched: Introduce Window Assisted Load Tracking (WALT)Srivatsa Vaddagiri2016-09-14
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | use a window based view of time in order to track task demand and CPU utilization in the scheduler. Window Assisted Load Tracking (WALT) implementation credits: Srivatsa Vaddagiri, Steve Muckle, Syed Rameez Mustafa, Joonwoo Park, Pavan Kumar Kondeti, Olav Haugan 2016-03-06: Integration with EAS/refactoring by Vikram Mulukutla and Todd Kjos Change-Id: I21408236836625d4e7d7de1843d20ed5ff36c708 Includes fixes for issues: eas/walt: Use walt_ktime_clock() instead of ktime_get_ns() to avoid a race resulting in watchdog resets BUG: 29353986 Change-Id: Ic1820e22a136f7c7ebd6f42e15f14d470f6bbbdb Handle walt accounting anomoly during resume During resume, there is a corner case where on wakeup, a task's prev_runnable_sum can go negative. This is a workaround that fixes the condition and warns (instead of crashing). BUG: 29464099 Change-Id: I173e7874324b31a3584435530281708145773508 Signed-off-by: Todd Kjos <tkjos@google.com> Signed-off-by: Srinath Sridharan <srinathsr@google.com> Signed-off-by: Juri Lelli <juri.lelli@arm.com> [jstultz: fwdported to 4.4] Signed-off-by: John Stultz <john.stultz@linaro.org>
| * | sched: rt scheduler sets capacity requirementVincent Guittot2016-09-14
| |/ | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | RT tasks don't provide any running constraints like deadline ones except their running priority. The only current usable input to estimate the capacity needed by RT tasks is the rt_avg metric. We use it to estimate the CPU capacity needed for the RT scheduler class. In order to monitor the evolution for RT task load, we must peridiocally check it during the tick. Then, we use the estimated capacity of the last activity to estimate the next one which can not be that accurate but is a good starting point without any impact on the wake up path of RT tasks. Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org> Signed-off-by: Steve Muckle <smuckle@linaro.org>
* | sched: Ensure proper task migration when a CPU is isolatedSyed Rameez Mustafa2016-12-09
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | migrate_tasks() migrates all tasks of a CPU by using pick_next_task(). This works in the hotplug case as we force migrate every single task allowing pick_next_task() to return a new task on every loop iteration. In the case of isolation, however, task migration is not guaranteed which causes pick_next_task() to keep returning the same task over and over again until we terminate the loop without having migrated all the tasks that were supposed to migrated. Fix the above problem by temporarily dequeuing tasks that are pinned and marking them with TASK_ON_RQ_MIGRATING. This not only allows pick_next_task() to properly walk the runqueue but also prevents any migrations or changes in affinity for the dequeued tasks. Once we are done with migrating all possible tasks, we re-enqueue all the dequeued tasks. While at it, ensure consistent ordering between task de-activation and setting the TASK_ON_RQ_MIGRATING flag across all scheduling classes. Change-Id: Id06151a8e34edab49ac76b4bffd50c132f0b792f Signed-off-by: Syed Rameez Mustafa <rameezmustafa@codeaurora.org>
* | sched/hmp: Enhance co-location and scheduler boost featuresSyed Rameez Mustafa2016-11-16
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | The recent introduction of the schedtune cgroup controller has provided the scheduler with added flexibility in terms of some of it's placement features. In particular each cgroup under the schedtune controller can now specify: 1) Whether it needs co-location along with other cgroups 2) Whether it is eligible for scheduler boost (sched_boost_enabled) 3) Whether the kernel can override the boost eligibility when necessary (sched_boost_no_override) The scheduler now creates a reserved co-location group at boot. This group is used to co-locate all tasks that form part of any one of the cgroups that have co-location enabled. This reserved group can neither be destroyed nor reused for other purposes. Furthermore, cgroups are only allowed to indicate their co-location preference once at boot. Further updates are disallowed. Since we are now creating co-location groups for an extended period of time, there are a few other factors to consider when determining the preferred cluster for the group. We first exclude any tasks in the group that have not been observed to be running for a significant amount of time. Secondly we introduce the notion of group up and down migrate tunables to allow different migration policies than individual tasks. Lastly we break co-location if a single task in a group exceeds up-migrate but the total load of the group does not exceed group up-migrate. In terms of sched_boost, the scheduler now supports multiple types of boost. These are: 1) FULL_THROTTLE : Force up-migrate tasks belonging any cgroup that has the sched_boost_enabled flag turned on. Little CPUs will only be used when big CPUs can no longer accommodate tasks. Also up-migrate all RT tasks. 2) CONSERVATIVE : Override the sched_boost_enabled flag for all cgroups except those that have the sched_boost_no_override flag set. Force up-migrate all tasks belonging to only those cgroups that still remain eligible for boost. RT tasks do not get force up migrated. 3) RESTRAINED : Start frequency aggregation for co-located tasks. This type of boost does not force up-migrate any task. Finally the boost API removes ref-counting. This means that there can only be a single entity using boost at any given time. If multiple entities are managing boost, they are required to be well behaved so that they don't interfere with one another. Even for a single client, it is not possible to switch directly from one boost type to another. Boost must be first turned off before switching over to a new type. Change-Id: I8d224a70cbef162f27078b62b73acaa22670861d Signed-off-by: Syed Rameez Mustafa <rameezmustafa@codeaurora.org>
* | sched: add cpu isolation supportOlav Haugan2016-09-24
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | This adds cpu isolation APIs to the scheduler to isolate and unisolate CPUs. Isolating and unisolating a CPU can be used in place of hotplug. Isolating and unisolating a CPU is faster than hotplug and can thus be used to optimize the performance and power of multi-core CPUs. Isolating works by migrating non-pinned IRQs and tasks to other CPUS and marking the CPU as not available to the scheduler and load balancer. Pinned tasks and IRQs are still allowed to run but it is expected that this would be minimal. Unisolation works by just marking the CPU available for scheduler and load balancer. Change-Id: I0bbddb56238c2958c5987877c5bfc3e79afa67cc Signed-off-by: Olav Haugan <ohaugan@codeaurora.org>
* | sched: let sched_boost take precedence over sched_restrict_cluster_spillPavankumar Kondeti2016-03-23
| | | | | | | | | | | | | | | | | | | | | | When sched_restrict_cluster_spill knob is enabled, RT tasks are restricted to lower power cluster. This knob also restricts inter cluster no-hz kicks. Ignore this knob setting when sched_boost is enabled so that tasks are placed on CPUs with highest spare capacity. CRs-Fixed: 968852 Change-Id: I01b3fc10b39dc834a733d64c2ee29c308d7ff730 Signed-off-by: Pavankumar Kondeti <pkondeti@codeaurora.org>
* | sched: Add separate load tracking histogram to predict loadsPavankumar Kondeti2016-03-23
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | Current window based load tracking only saves history for five windows. A historically heavy task's heavy load will be completely forgotten after five windows of light load. Even before the five window expires, a heavy task wakes up on same CPU it used to run won't trigger any frequency change until end of the window. It would starve for the entire window. It also adds one "small" load window to history because it's accumulating load at a low frequency, further reducing the tracked load for this heavy task. Ideally, scheduler should be able to identify such tasks and notify governor to increase frequency immediately after it wakes up. Add a histogram for each task to track a much longer load history. A prediction will be made based on runtime of previous or current window, histogram data and load tracked in recent windows. Prediction of all tasks that is currently running or runnable on a CPU is aggregated and reported to CPUFreq governor in sched_get_cpus_busy(). sched_get_cpus_busy() now returns predicted busy time in addition to previous window busy time and new task busy time, scaled to the CPU maximum possible frequency. Tunables: - /proc/sys/kernel/sched_gov_alert_freq (KHz) This tunable can be used to further filter the notifications. Frequency alert notification is sent only when the predicted load exceeds previous window load by sched_gov_alert_freq converted to load. Change-Id: If29098cd2c5499163ceaff18668639db76ee8504 Suggested-by: Saravana Kannan <skannan@codeaurora.org> Signed-off-by: Pavankumar Kondeti <pkondeti@codeaurora.org> Signed-off-by: Joonwoo Park <joonwoop@codeaurora.org> Signed-off-by: Junjie Wu <junjiew@codeaurora.org> [joonwoop@codeaurora.org: fixed merge conflicts around __migrate_task() and removed changes for CONFIG_SCHED_QHMP.]
* | sched: Provide a facility to restrict RT tasks to lower power clusterPavankumar Kondeti2016-03-23
| | | | | | | | | | | | | | | | | | | | The current CPU selection algorithm for RT tasks looks for the least loaded CPU in all clusters. Stop the search at the lowest possible power cluster based on "sched_restrict_cluster_spill" sysctl tunable. Change-Id: I34fdaefea56e0d1b7e7178d800f1bb86aa0ec01c Signed-off-by: Pavankumar Kondeti <pkondeti@codeaurora.org>
* | sched: Update fair and rt placement logic to use scheduler clustersSrivatsa Vaddagiri2016-03-23
| | | | | | | | | | | | | | | | | | | | | | | | | | | | Make use of clusters in the fair and rt scheduling classes. This is needed as the freq domain mask can no longer be used to do correct task placement. The freq domain mask was being used to demarcate clusters. Change-Id: I57f74147c7006f22d6760256926c10fd0bf50cbd Signed-off-by: Srivatsa Vaddagiri <vatsa@codeaurora.org> Signed-off-by: Syed Rameez Mustafa <rameezmustafa@codeaurora.org> [joonwoop@codeaurora.org: fixed merge conflicts due to omitted changes for CONFIG_SCHED_QHMP.] Signed-off-by: Joonwoo Park <joonwoop@codeaurora.org>
* | sched: Optimize scheduler trace events to reduce trace buffer usageSyed Rameez Mustafa2016-03-23
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | Scheduler ftrace events currently generate a lot of data when turned on. The excessive log messages often end up overflowing trace buffers for long use cases or crowding out other events. Optimize scheduler events so that the log spew is less and more manageable. To that end change the variable type for some event fields; introduce variants of sched_cpu_load that can be turned on/off for separate code paths and remove unused fields from various events. Change-Id: I2b313542b39ad5e09a01ad1303b5dfe2c4883b8a Signed-off-by: Syed Rameez Mustafa <rameezmustafa@codeaurora.org> [joonwoop@codeaurora.org: fixed conflict in rt.c due to CONFIG_SCHED_QHMP.] Signed-off-by: Joonwoo Park <joonwoop@codeaurora.org>
* | sched: add preference for prev and sibling CPU in RT task placementJoonwoo Park2016-03-23
| | | | | | | | | | | | | | | | | | Add a bias towards the RT task's previous CPU and sibling CPUs in order to avoid cache bouncing and migrations. CRs-fixed: 927903 Change-Id: I45d79d774e65efcb38282130b6692b4c3b03c2f0 Signed-off-by: Joonwoo Park <joonwoop@codeaurora.org>
* | sched: precompute required frequency for CPU loadJoonwoo Park2016-03-23
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | At present in order to estimate power cost of CPU load, HMP scheduler converts CPU load to coresponding frequency on the fly which can be avoided. Optimize and reduce execution time of select_best_cpu() by precomputing CPU load to frequency conversion. This optimization reduces about ~20% of execution time of select_best_cpu() on average. Change-Id: I385c57f2ea9a50883b76ba6ca3deb673b827217f [joonwoop@codeaurora.org: fixed minior conflict in kernel/sched/sched.h. stripped out codes for CONFIG_SCHED_QHMP.] Signed-off-by: Joonwoo Park <joonwoop@codeaurora.org>
* | sched: clean up fixup_hmp_sched_stats()Joonwoo Park2016-03-23
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | The commit 392edf4969d20 ("sched: avoid stale cumulative_runnable_avg HMP statistics) introduced the callback function fixup_hmp_sched_stats() so update_history() can avoid decrement and increment pair of HMP stat. However the commit also made fixup function to do obscure p->ravg.demand update which isn't the cleanest way. Revise the function fixup_hmp_sched_stats() so the caller can update p->ravg.demand directly. Change-Id: Id54667d306495d2109c26362813f80f08a1385ad [joonwoop@codeaurora.org: stripped out CONFIG_SCHED_QHMP.] Signed-off-by: Joonwoo Park <joonwoop@codeaurora.org>
* | sched: Update task->on_rq when tasks are moving between runqueuesOlav Haugan2016-03-23
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | Task->on_rq has three states: 0 - Task is not on runqueue (rq) 1 (TASK_ON_RQ_QUEUED) - Task is on rq 2 (TASK_ON_RQ_MIGRATING) - Task is on rq but in the process of being migrated to another rq When a task is moving between rqs task->on_rq state should be TASK_ON_RQ_MIGRATING CRs-fixed: 884720 Change-Id: I1572aba00a0273d4ad5bc9a3dd60fb68e2f0b895 Signed-off-by: Olav Haugan <ohaugan@codeaurora.org>
* | sched: Update the wakeup placement logic for fair and rt tasksSyed Rameez Mustafa2016-03-23
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | For the fair sched class, update the select_best_cpu() policy to do power based placement. The hope is to minimize the voltage at which the CPU runs. While RT tasks already do power based placement, their placement preference has to now take into account the power cost of all tasks on a given CPU. Also remove the check for sched_boost since sched_boost no longer intends to elevate all tasks to the highest capacity cluster. Change-Id: Ic6a7625c97d567254d93b94cec3174a91727cb87 Signed-off-by: Syed Rameez Mustafa <rameezmustafa@codeaurora.org>
* | sched: avoid stale cumulative_runnable_avg HMP statisticsJoonwoo Park2016-03-23
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | When a new window starts for a task and the task is on a rq, scheduler decreases rq's cumulative_runnable_avg momentarily, re-account task's demand and increases rq's cumulative_runnable_avg with newly accounted task's demand. Therefore there is short time period that rq's cumulative_runnable_avg is less than what it's supposed to be. Meanwhile, there is chance that other CPU is in search of best CPU to place a task and makes suboptimal decision with momentarily stale cumulative_runnable_avg. Fix such issue by adding or subtracting of delta between task's old and new demand instead of decrementing and incrementing of entire task's load. Change-Id: I3c9329961e6f96e269fa13359e7d1c39c4973ff2 Signed-off-by: Joonwoo Park <joonwoop@codeaurora.org>
* | sched: Add load based placement for RT tasksSyed Rameez Mustafa2016-03-23
| | | | | | | | | | | | | | | | | | | | Currently RT tasks prefer to go to the lowest power CPU in the system. This can end up causing contention on the lowest power CPU. Instead ensure that RT tasks end up on the lowest power cluster and the least loaded CPU within that cluster. Change-Id: I363b3d43236924962c67d2fb5d3d2d09800cd994 Signed-off-by: Syed Rameez Mustafa <rameezmustafa@codeaurora.org>
* | sched: Consolidate hmp stats into their own structSrivatsa Vaddagiri2016-03-23
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | Key hmp stats (nr_big_tasks, nr_small_tasks and cumulative_runnable_average) are currently maintained per-cpu in 'struct rq'. Merge those stats in their own structure (struct hmp_sched_stats) and modify impacted functions to deal with the newly introduced structure. This cleanup is required for a subsequent patch which fixes various issues with use of CFS_BANDWIDTH feature in HMP scheduler. Change-Id: Ieffc10a3b82a102f561331bc385d042c15a33998 Signed-off-by: Srivatsa Vaddagiri <vatsa@codeaurora.org> [rameezmustafa@codeaurora.org: Port to msm-3.18] Signed-off-by: Syed Rameez Mustafa <rameezmustafa@codeaurora.org> [joonwoop@codeaurora.org: fixed conflict in __update_load_avg().] Signed-off-by: Joonwoo Park <joonwoop@codeaurora.org>
* | sched: Add temperature to cpu_load trace pointOlav Haugan2016-03-23
| | | | | | | | | | | | | | | | | | Add the current CPU temperature to the sched_cpu_load trace point. This will allow us to track the CPU temperature. CRs-Fixed: 764788 Change-Id: Ib2e3559bbbe3fe07a6b7c8115db606828bc36254 Signed-off-by: Olav Haugan <ohaugan@codeaurora.org>
* | sched: trace: extend sched_cpu_load to print irqloadSteve Muckle2016-03-23
| | | | | | | | | | | | | | | | The irqload is used in determining whether CPUs are mostly idle so it is useful to know this value while viewing scheduler traces. Change-Id: Icbb74fc1285be878f254ae54886bdb161b14a270 Signed-off-by: Steve Muckle <smuckle@codeaurora.org>
* | sched: avoid CPUs with high irq activitySteve Muckle2016-03-23
| | | | | | | | | | | | | | | | | | CPUs with significant IRQ activity will not be able to serve tasks quickly. Avoid them if possible by disqualifying such CPUs from being recognized as mostly idle. Change-Id: I2c09272a4f259f0283b272455147d288fce11982 Signed-off-by: Steve Muckle <smuckle@codeaurora.org>
* | sched: Make RT tasks eligible for boostSyed Rameez Mustafa2016-03-23
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | During sched boost RT tasks currently end up going to the lowest power cluster. This can be a performance bottleneck especially if the frequency and IPC differences between clusters are high. Furthermore, when RT tasks go over to the little cluster during boost, the load balancer keeps attempting to pull work over to the big cluster. This results in pre-emption of the executing RT task causing more delays. Finally, containing more work on a single cluster during boost might help save some power if the little cluster can then enter deeper low power modes. Change-Id: I177b2e81be5657c23e7ac43889472561ce9993a9 Signed-off-by: Syed Rameez Mustafa <rameezmustafa@codeaurora.org>
* | sched: remove sysctl control for HMP and power-aware task placementSrivatsa Vaddagiri2016-03-23
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | There is no real need to control HMP and power-aware task placement at runtime after kernel has booted. Boot-time control should be sufficient. Not allowing for runtime (sysctl) support simplifies the code quite a bit. Also rename sysctl_sched_enable_hmp_task_placement to be shorter. Change-Id: I60cae51a173c6f73b79cbf90c50ddd41a27604aa Signed-off-by: Srivatsa Vaddagiri <vatsa@codeaurora.org> Signed-off-by: Syed Rameez Mustafa <rameezmustafa@codeaurora.org> [joonwoop@codeaurora.org: fixed minor conflict. p->nr_cpus_allowed == 1 has moved to core.c Signed-off-by: Joonwoo Park <joonwoop@codeaurora.org>
* | sched: support legacy mode betterSrivatsa Vaddagiri2016-03-23
| | | | | | | | | | | | | | | | | | | | | | | | | | | | It should be possible to bypass all HMP scheduler changes at runtime by setting sysctl_sched_enable_hmp_task_placement and sysctl_sched_enable_power_aware to 0. Fix various code paths to honor this requirement. Change-Id: I74254e68582b3f9f1b84661baf7dae14f981c025 Signed-off-by: Srivatsa Vaddagiri <vatsa@codeaurora.org> Signed-off-by: Syed Rameez Mustafa <rameezmustafa@codeaurora.org> [joonwoop@codeaurora.org: fixed conflict in rt.c, p->nr_cpus_allowed == 1 is now moved in core.c] Signed-off-by: Joonwoo Park <joonwoop@codeaurora.org>
* | sched/rt: Introduce power aware scheduling for real time tasksSyed Rameez Mustafa2016-03-23
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | Real Time task scheduling has historically been geared towards performance with a significant attempt to keep higher priority tasks on the same CPU. This is not optimal for power since the task CPU may not be the most power efficient CPU. Also task movement via select_lowest_rq() gives CPU priority the primary consideration before looking at CPU topologies to find a CPU closest to the task CPU in terms of topology. This again is not optimal for power since the closest CPU may be significantly worse for power than CPUs further away. This patch removes any bias for the task CPU. When the lowest priority CPUs in the system are found we give no consideration to the CPU topology. Instead we find the lowest power CPU within local_cpu_mask. This takes care of select_task_rq_rt() and push_task(). The pull model remains unaffected since we have no room for power optimization there. Change-Id: I4162ebe2f74be14240e62476f231f9e4a18bd9e8 Signed-off-by: Syed Rameez Mustafa <rameezmustafa@codeaurora.org> Signed-off-by: Srivatsa Vaddagiri <vatsa@codeaurora.org> [joonwoop@codeaurora.org: s/__get_cpu_var/this_cpu_ptr/] Signed-off-by: Joonwoo Park <joonwoop@codeaurora.org>
* | sched/rt: Add Kconfig option to enable panicking for RT throttlingMatt Wagantall2016-03-23
| | | | | | | | | | | | | | | | | | | | | | This may be useful for detecting and debugging RT throttling issues. Change-Id: I5807a897d11997d76421c1fcaa2918aad988c6c9 Signed-off-by: Matt Wagantall <mattw@codeaurora.org> [rameezmustafa@codeaurora.org]: Port to msm-3.18] Signed-off-by: Syed Rameez Mustafa <rameezmustafa@codeaurora.org> [joonwoop@codeaurora.org: fixed conflict in lib/Kconfig.debug] Signed-off-by: Joonwoo Park <joonwoop@codeaurora.org>
* | sched/rt: print RT tasks when RT throttling is activatedMatt Wagantall2016-03-23
|/ | | | | | | | | | | | Existing debug prints do not provide any clues about which tasks may have triggered RT throttling. Print the names and PIDs of all tasks on the throttled rt_rq to help narrow down the source of the problem. Change-Id: I180534c8a647254ed38e89d0c981a8f8bccd741c Signed-off-by: Matt Wagantall <mattw@codeaurora.org> [rameezmustafa@codeaurora.org]: Port to msm-3.18] Signed-off-by: Syed Rameez Mustafa <rameezmustafa@codeaurora.org>
* sched/rt: Hide the push_irq_work_func() declarationArnd Bergmann2015-11-23
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | The push_irq_work_func() function is conditionally defined only when both CONFIG_SMP and HAVE_RT_PUSH_IPI are defined, but the forward declaration remains visibile without HAVE_RT_PUSH_IPI, causing a gcc warning in ARM64 allnoconfig: kernel/sched/rt.c:68:13: warning: 'push_irq_work_func' declared 'static' but never defined [-Wunused-function] This changes the code to use the same condition for both the declaration and the function definition, which gets rid of the warning. As Peter Zijlstra, we can possibly get rid of the whole HAVE_RT_PUSH_IPI thing after: 8053871d0f7f ("smp: Fix smp_call_function_single_async() locking") Until that is done, this patch can be used to avoid the warning. Signed-off-by: Arnd Bergmann <arnd@arndb.de> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Acked-by: Steven Rostedt <rostedt@goodmis.org> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Fixes: b6366f048e0c ("sched/rt: Use IPI to trigger RT task push migration instead of pulling") Link: http://lkml.kernel.org/r/3828565.oKfGk7yNIT@wuerfel Signed-off-by: Ingo Molnar <mingo@kernel.org>
* sched/rt: Make (do_)balance_runtime() return voidJuri Lelli2015-09-23
| | | | | | | | | | | | | | The return value of (do_)balance_runtime() is not consumed by anybody. Make them return void. Signed-off-by: Juri Lelli <juri.lelli@arm.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Mike Galbraith <efault@gmx.de> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Link: http://lkml.kernel.org/r/1441188096-23021-5-git-send-email-juri.lelli@arm.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
* sched: Change the sched_class::set_cpus_allowed() calling contextPeter Zijlstra2015-08-12
| | | | | | | | | | | | | | | | | | | | | | Change the calling context of sched_class::set_cpus_allowed() such that we can assume the task is inactive. This allows us to easily make changes that affect accounting done by enqueue/dequeue. This does in fact completely remove set_cpus_allowed_rt() and greatly reduces set_cpus_allowed_dl(). Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Mike Galbraith <efault@gmx.de> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: dedekind1@gmail.com Cc: juri.lelli@arm.com Cc: mgorman@suse.de Cc: riel@redhat.com Cc: rostedt@goodmis.org Link: http://lkml.kernel.org/r/20150515154833.667516139@infradead.org Signed-off-by: Ingo Molnar <mingo@kernel.org>
* sched: Make sched_class::set_cpus_allowed() unconditionalPeter Zijlstra2015-08-12
| | | | | | | | | | | | | | | | | | | Give every class a set_cpus_allowed() method, this enables some small optimization in the RT,DL implementation by avoiding a double cpumask_weight() call. Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Mike Galbraith <efault@gmx.de> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: dedekind1@gmail.com Cc: juri.lelli@arm.com Cc: mgorman@suse.de Cc: riel@redhat.com Cc: rostedt@goodmis.org Link: http://lkml.kernel.org/r/20150515154833.614517487@infradead.org Signed-off-by: Ingo Molnar <mingo@kernel.org>
* sched/rt: Remove a redundant condition from task_woken_rt()Xunlei Pang2015-08-03
| | | | | | | | | | | | | | | | | 'p' has been already queued at this point, so "!task_running(rq, p)" and "p->nr_cpus_allowed > 1" imply that "has_pushable_tasks(rq)" is true, so it can be removed. Signed-off-by: Xunlei Pang <pang.xunlei@linaro.org> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Acked-by: Steven Rostedt <rostedt@goodmis.org> Cc: Juri Lelli <juri.lelli@gmail.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Mike Galbraith <efault@gmx.de> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Link: http://lkml.kernel.org/r/1435995563-3723-1-git-send-email-xlpang@126.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
* sched,lockdep: Employ lock pinningPeter Zijlstra2015-06-19
| | | | | | | | | | | | | | | | Employ the new lockdep lock pinning annotation to ensure no 'accidental' lock-breaks happen with rq->lock. Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: ktkhai@parallels.com Cc: rostedt@goodmis.org Cc: juri.lelli@gmail.com Cc: pang.xunlei@linaro.org Cc: oleg@redhat.com Cc: wanpeng.li@linux.intel.com Cc: umgwanakikbuti@gmail.com Link: http://lkml.kernel.org/r/20150611124744.003233193@infradead.org Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
* sched, rt: Convert switched_{from, to}_rt() / prio_changed_rt() to balance ↵Peter Zijlstra2015-06-19
| | | | | | | | | | | | | | | | | | | | | | callbacks Remove the direct {push,pull} balancing operations from switched_{from,to}_rt() / prio_changed_rt() and use the balance callback queue. Again, err on the side of too many reschedules; since too few is a hard bug while too many is just annoying. Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: ktkhai@parallels.com Cc: rostedt@goodmis.org Cc: juri.lelli@gmail.com Cc: pang.xunlei@linaro.org Cc: oleg@redhat.com Cc: wanpeng.li@linux.intel.com Cc: umgwanakikbuti@gmail.com Link: http://lkml.kernel.org/r/20150611124742.766832367@infradead.org Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
* sched,rt: Remove return value from pull_rt_task()Peter Zijlstra2015-06-19
| | | | | | | | | | | | | | | | | | | | | | In order to be able to use pull_rt_task() from a callback, we need to do away with the return value. Since the return value indicates if we should reschedule, do this inside the function. Since not all callers currently do this, this can increase the number of reschedules due rt balancing. Too many reschedules is not a correctness issues, too few are. Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: ktkhai@parallels.com Cc: rostedt@goodmis.org Cc: juri.lelli@gmail.com Cc: pang.xunlei@linaro.org Cc: oleg@redhat.com Cc: wanpeng.li@linux.intel.com Cc: umgwanakikbuti@gmail.com Link: http://lkml.kernel.org/r/20150611124742.679002000@infradead.org Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
* sched: Replace post_schedule with a balance callback listPeter Zijlstra2015-06-19
| | | | | | | | | | | | | | | | | Generalize the post_schedule() stuff into a balance callback list. This allows us to more easily use it outside of schedule() and cross sched_class. Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: ktkhai@parallels.com Cc: rostedt@goodmis.org Cc: juri.lelli@gmail.com Cc: pang.xunlei@linaro.org Cc: oleg@redhat.com Cc: wanpeng.li@linux.intel.com Cc: umgwanakikbuti@gmail.com Link: http://lkml.kernel.org/r/20150611124742.424032725@infradead.org Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
* Merge branch 'timers/core' into sched/hrtimersThomas Gleixner2015-06-19
|\ | | | | | | | | Merge sched/core and timers/core so we can apply the sched balancing patch queue, which depends on both.
| * sched,perf: Fix periodic timersPeter Zijlstra2015-05-18
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | In the below two commits (see Fixes) we have periodic timers that can stop themselves when they're no longer required, but need to be (re)-started when their idle condition changes. Further complications is that we want the timer handler to always do the forward such that it will always correctly deal with the overruns, and we do not want to race such that the handler has already decided to stop, but the (external) restart sees the timer still active and we end up with a 'lost' timer. The problem with the current code is that the re-start can come before the callback does the forward, at which point the forward from the callback will WARN about forwarding an enqueued timer. Now, conceptually its easy to detect if you're before or after the fwd by comparing the expiration time against the current time. Of course, that's expensive (and racy) because we don't have the current time. Alternatively one could cache this state inside the timer, but then everybody pays the overhead of maintaining this extra state, and that is undesired. The only other option that I could see is the external timer_active variable, which I tried to kill before. I would love a nicer interface for this seemingly simple 'problem' but alas. Fixes: 272325c4821f ("perf: Fix mux_interval hrtimer wreckage") Fixes: 77a4d1a1b9a1 ("sched: Cleanup bandwidth timers") Cc: pjt@google.com Cc: tglx@linutronix.de Cc: klamm@yandex-team.ru Cc: mingo@kernel.org Cc: bsegall@google.com Cc: hpa@zytor.com Cc: Sasha Levin <sasha.levin@oracle.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Link: http://lkml.kernel.org/r/20150514102311.GX21418@twins.programming.kicks-ass.net
| * sched: Cleanup bandwidth timersPeter Zijlstra2015-04-22
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | Roman reported a 3 cpu lockup scenario involving __start_cfs_bandwidth(). The more I look at that code the more I'm convinced its crack, that entire __start_cfs_bandwidth() thing is brain melting, we don't need to cancel a timer before starting it, *hrtimer_start*() will happily remove the timer for you if its still enqueued. Removing that, removes a big part of the problem, no more ugly cancel loop to get stuck in. So now, if I understand things right, the entire reason you have this cfs_b->lock guarded ->timer_active nonsense is to make sure we don't accidentally lose the timer. It appears to me that it should be possible to guarantee that same by unconditionally (re)starting the timer when !queued. Because regardless what hrtimer::function will return, if we beat it to (re)enqueue the timer, it doesn't matter. Now, because hrtimers don't come with any serialization guarantees we must ensure both handler and (re)start loop serialize their access to the hrtimer to avoid both trying to forward the timer at the same time. Update the rt bandwidth timer to match. This effectively reverts: 09dc4ab03936 ("sched/fair: Fix tg_set_cfs_bandwidth() deadlock on rq->lock"). Reported-by: Roman Gushchin <klamm@yandex-team.ru> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Reviewed-by: Ben Segall <bsegall@google.com> Cc: Paul Turner <pjt@google.com> Link: http://lkml.kernel.org/r/20150415095011.804589208@infradead.org Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
* | sched, timer: Convert usages of ACCESS_ONCE() in the scheduler to ↵Jason Low2015-05-08
|/ | | | | | | | | | | | | | | | | | | | | | | | | | | | | | READ_ONCE()/WRITE_ONCE() ACCESS_ONCE doesn't work reliably on non-scalar types. This patch removes the rest of the existing usages of ACCESS_ONCE() in the scheduler, and use the new READ_ONCE() and WRITE_ONCE() APIs as appropriate. Signed-off-by: Jason Low <jason.low2@hp.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Acked-by: Thomas Gleixner <tglx@linutronix.de> Acked-by: Rik van Riel <riel@redhat.com> Acked-by: Waiman Long <Waiman.Long@hp.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Aswin Chandramouleeswaran <aswin@hp.com> Cc: Borislav Petkov <bp@alien8.de> Cc: Davidlohr Bueso <dave@stgolabs.net> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Mel Gorman <mgorman@suse.de> Cc: Mike Galbraith <umgwanakikbuti@gmail.com> Cc: Oleg Nesterov <oleg@redhat.com> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: Preeti U Murthy <preeti@linux.vnet.ibm.com> Cc: Scott J Norton <scott.norton@hp.com> Cc: Steven Rostedt <rostedt@goodmis.org> Link: http://lkml.kernel.org/r/1430251224-5764-2-git-send-email-jason.low2@hp.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
* sched/core: Remove unused argument from init_[rt|dl]_rq()Abel Vesa2015-04-02
| | | | | | | | | | Obviously, 'rq' is not used in these two functions, therefore, there is no reason for it to be passed as an argument. Signed-off-by: Abel Vesa <abelvesa@gmail.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Link: http://lkml.kernel.org/r/1425383427-26244-1-git-send-email-abelvesa@gmail.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
* sched/rt: Use IPI to trigger RT task push migration instead of pullingSteven Rostedt2015-03-23
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | When debugging the latencies on a 40 core box, where we hit 300 to 500 microsecond latencies, I found there was a huge contention on the runqueue locks. Investigating it further, running ftrace, I found that it was due to the pulling of RT tasks. The test that was run was the following: cyclictest --numa -p95 -m -d0 -i100 This created a thread on each CPU, that would set its wakeup in iterations of 100 microseconds. The -d0 means that all the threads had the same interval (100us). Each thread sleeps for 100us and wakes up and measures its latencies. cyclictest is maintained at: git://git.kernel.org/pub/scm/linux/kernel/git/clrkwllms/rt-tests.git What happened was another RT task would be scheduled on one of the CPUs that was running our test, when the other CPU tests went to sleep and scheduled idle. This caused the "pull" operation to execute on all these CPUs. Each one of these saw the RT task that was overloaded on the CPU of the test that was still running, and each one tried to grab that task in a thundering herd way. To grab the task, each thread would do a double rq lock grab, grabbing its own lock as well as the rq of the overloaded CPU. As the sched domains on this box was rather flat for its size, I saw up to 12 CPUs block on this lock at once. This caused a ripple affect with the rq locks especially since the taking was done via a double rq lock, which means that several of the CPUs had their own rq locks held while trying to take this rq lock. As these locks were blocked, any wakeups or load balanceing on these CPUs would also block on these locks, and the wait time escalated. I've tried various methods to lessen the load, but things like an atomic counter to only let one CPU grab the task wont work, because the task may have a limited affinity, and we may pick the wrong CPU to take that lock and do the pull, to only find out that the CPU we picked isn't in the task's affinity. Instead of doing the PULL, I now have the CPUs that want the pull to send over an IPI to the overloaded CPU, and let that CPU pick what CPU to push the task to. No more need to grab the rq lock, and the push/pull algorithm still works fine. With this patch, the latency dropped to just 150us over a 20 hour run. Without the patch, the huge latencies would trigger in seconds. I've created a new sched feature called RT_PUSH_IPI, which is enabled by default. When RT_PUSH_IPI is not enabled, the old method of grabbing the rq locks and having the pulling CPU do the work is implemented. When RT_PUSH_IPI is enabled, the IPI is sent to the overloaded CPU to do a push. To enabled or disable this at run time: # mount -t debugfs nodev /sys/kernel/debug # echo RT_PUSH_IPI > /sys/kernel/debug/sched_features or # echo NO_RT_PUSH_IPI > /sys/kernel/debug/sched_features Update: This original patch would send an IPI to all CPUs in the RT overload list. But that could theoretically cause the reverse issue. That is, there could be lots of overloaded RT queues and one CPU lowers its priority. It would then send an IPI to all the overloaded RT queues and they could then all try to grab the rq lock of the CPU lowering its priority, and then we have the same problem. The latest design sends out only one IPI to the first overloaded CPU. It tries to push any tasks that it can, and then looks for the next overloaded CPU that can push to the source CPU. The IPIs stop when all overloaded CPUs that have pushable tasks that have priorities greater than the source CPU are covered. In case the source CPU lowers its priority again, a flag is set to tell the IPI traversal to restart with the first RT overloaded CPU after the source CPU. Parts-suggested-by: Peter Zijlstra <peterz@infradead.org> Signed-off-by: Steven Rostedt <rostedt@goodmis.org> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: Joern Engel <joern@purestorage.com> Cc: Clark Williams <williams@redhat.com> Cc: Mike Galbraith <umgwanakikbuti@gmail.com> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: Thomas Gleixner <tglx@linutronix.de> Link: http://lkml.kernel.org/r/20150318144946.2f3cc982@gandalf.local.home Signed-off-by: Ingo Molnar <mingo@kernel.org>
* sched/rt: Reduce rq lock contention by eliminating locking of non-feasible ↵Tim Chen2015-01-30
| | | | | | | | | | | | | | | | | | | | target This patch adds checks that prevens futile attempts to move rt tasks to a CPU with active tasks of equal or higher priority. This reduces run queue lock contention and improves the performance of a well known OLTP benchmark by 0.7%. Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: Shawn Bohrer <sbohrer@rgmadvisors.com> Cc: Suruchi Kadu <suruchi.a.kadu@intel.com> Cc: Doug Nelson<doug.nelson@intel.com> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Linus Torvalds <torvalds@linux-foundation.org> Link: http://lkml.kernel.org/r/1421430374.2399.27.camel@schen9-desk2.jf.intel.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
* sched/core: Rework rq->clock update skipsPeter Zijlstra2015-01-14
| | | | | | | | | | | | | | | | | | | | The original purpose of rq::skip_clock_update was to avoid 'costly' clock updates for back to back wakeup-preempt pairs. The big problem with it has always been that the rq variable is unaware of the context and causes indiscrimiate clock skips. Rework the entire thing and create a sense of context by only allowing schedule() to skip clock updates. (XXX can we measure the cost of the added store?) By ensuring only schedule can ever skip an update, we guarantee we're never more than 1 tick behind on the update. Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: umgwanakikbuti@gmail.com Link: http://lkml.kernel.org/r/20150105103554.432381549@infradead.org Signed-off-by: Ingo Molnar <mingo@kernel.org>
* sched: Move p->nr_cpus_allowed check to select_task_rq()Wanpeng Li2014-11-16
| | | | | | | | | | | | | | Move the p->nr_cpus_allowed check into kernel/sched/core.c: select_task_rq(). This change will make fair.c, rt.c, and deadline.c all start with the same logic. Suggested-and-Acked-by: Steven Rostedt <rostedt@goodmis.org> Signed-off-by: Wanpeng Li <wanpeng.li@linux.intel.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: "pang.xunlei" <pang.xunlei@linaro.org> Cc: Linus Torvalds <torvalds@linux-foundation.org> Link: http://lkml.kernel.org/r/1415150077-59053-1-git-send-email-wanpeng.li@linux.intel.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
* Merge branch 'sched/urgent' into sched/core, to pick up fixes before ↵Ingo Molnar2014-11-16
|\ | | | | | | | | | | applying more changes Signed-off-by: Ingo Molnar <mingo@kernel.org>
| * sched/cputime: Fix clock_nanosleep()/clock_gettime() inconsistencyStanislaw Gruszka2014-11-16
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | Commit d670ec13178d0 "posix-cpu-timers: Cure SMP wobbles" fixes one glibc test case in cost of breaking another one. After that commit, calling clock_nanosleep(TIMER_ABSTIME, X) and then clock_gettime(&Y) can result of Y time being smaller than X time. Reproducer/tester can be found further below, it can be compiled and ran by: gcc -o tst-cpuclock2 tst-cpuclock2.c -pthread while ./tst-cpuclock2 ; do : ; done This reproducer, when running on a buggy kernel, will complain about "clock_gettime difference too small". Issue happens because on start in thread_group_cputimer() we initialize sum_exec_runtime of cputimer with threads runtime not yet accounted and then add the threads runtime to running cputimer again on scheduler tick, making it's sum_exec_runtime bigger than actual threads runtime. KOSAKI Motohiro posted a fix for this problem, but that patch was never applied: https://lkml.org/lkml/2013/5/26/191 . This patch takes different approach to cure the problem. It calls update_curr() when cputimer starts, that assure we will have updated stats of running threads and on the next schedule tick we will account only the runtime that elapsed from cputimer start. That also assure we have consistent state between cpu times of individual threads and cpu time of the process consisted by those threads. Full reproducer (tst-cpuclock2.c): #define _GNU_SOURCE #include <unistd.h> #include <sys/syscall.h> #include <stdio.h> #include <time.h> #include <pthread.h> #include <stdint.h> #include <inttypes.h> /* Parameters for the Linux kernel ABI for CPU clocks. */ #define CPUCLOCK_SCHED 2 #define MAKE_PROCESS_CPUCLOCK(pid, clock) \ ((~(clockid_t) (pid) << 3) | (clockid_t) (clock)) static pthread_barrier_t barrier; /* Help advance the clock. */ static void *chew_cpu(void *arg) { pthread_barrier_wait(&barrier); while (1) ; return NULL; } /* Don't use the glibc wrapper. */ static int do_nanosleep(int flags, const struct timespec *req) { clockid_t clock_id = MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED); return syscall(SYS_clock_nanosleep, clock_id, flags, req, NULL); } static int64_t tsdiff(const struct timespec *before, const struct timespec *after) { int64_t before_i = before->tv_sec * 1000000000ULL + before->tv_nsec; int64_t after_i = after->tv_sec * 1000000000ULL + after->tv_nsec; return after_i - before_i; } int main(void) { int result = 0; pthread_t th; pthread_barrier_init(&barrier, NULL, 2); if (pthread_create(&th, NULL, chew_cpu, NULL) != 0) { perror("pthread_create"); return 1; } pthread_barrier_wait(&barrier); /* The test. */ struct timespec before, after, sleeptimeabs; int64_t sleepdiff, diffabs; const struct timespec sleeptime = {.tv_sec = 0,.tv_nsec = 100000000 }; /* The relative nanosleep. Not sure why this is needed, but its presence seems to make it easier to reproduce the problem. */ if (do_nanosleep(0, &sleeptime) != 0) { perror("clock_nanosleep"); return 1; } /* Get the current time. */ if (clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &before) < 0) { perror("clock_gettime[2]"); return 1; } /* Compute the absolute sleep time based on the current time. */ uint64_t nsec = before.tv_nsec + sleeptime.tv_nsec; sleeptimeabs.tv_sec = before.tv_sec + nsec / 1000000000; sleeptimeabs.tv_nsec = nsec % 1000000000; /* Sleep for the computed time. */ if (do_nanosleep(TIMER_ABSTIME, &sleeptimeabs) != 0) { perror("absolute clock_nanosleep"); return 1; } /* Get the time after the sleep. */ if (clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &after) < 0) { perror("clock_gettime[3]"); return 1; } /* The time after sleep should always be equal to or after the absolute sleep time passed to clock_nanosleep. */ sleepdiff = tsdiff(&sleeptimeabs, &after); if (sleepdiff < 0) { printf("absolute clock_nanosleep woke too early: %" PRId64 "\n", sleepdiff); result = 1; printf("Before %llu.%09llu\n", before.tv_sec, before.tv_nsec); printf("After %llu.%09llu\n", after.tv_sec, after.tv_nsec); printf("Sleep %llu.%09llu\n", sleeptimeabs.tv_sec, sleeptimeabs.tv_nsec); } /* The difference between the timestamps taken before and after the clock_nanosleep call should be equal to or more than the duration of the sleep. */ diffabs = tsdiff(&before, &after); if (diffabs < sleeptime.tv_nsec) { printf("clock_gettime difference too small: %" PRId64 "\n", diffabs); result = 1; } pthread_cancel(th); return result; } Signed-off-by: Stanislaw Gruszka <sgruszka@redhat.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: Rik van Riel <riel@redhat.com> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Cc: Oleg Nesterov <oleg@redhat.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Link: http://lkml.kernel.org/r/20141112155843.GA24803@redhat.com Signed-off-by: Ingo Molnar <mingo@kernel.org>