Diffstat (limited to 'kernel'): 96 files changed, 5789 insertions, 2450 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 53abf008ecb3..2dea801370f2 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -19,6 +19,17 @@ CFLAGS_REMOVE_cgroup-debug.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_irq_work.o = $(CC_FLAGS_FTRACE) endif +# Prevents flicker of uninteresting __do_softirq()/__local_bh_disable_ip() +# in coverage traces. +KCOV_INSTRUMENT_softirq.o := n +# These are called from save_stack_trace() on slub debug path, +# and produce insane amounts of uninteresting coverage. +KCOV_INSTRUMENT_module.o := n +KCOV_INSTRUMENT_extable.o := n +# Don't self-instrument. +KCOV_INSTRUMENT_kcov.o := n +KASAN_SANITIZE_kcov.o := n + # cond_syscall is currently not LTO compatible CFLAGS_sys_ni.o = $(DISABLE_LTO) @@ -69,6 +80,7 @@ obj-$(CONFIG_AUDITSYSCALL) += auditsc.o obj-$(CONFIG_AUDIT_WATCH) += audit_watch.o audit_fsnotify.o obj-$(CONFIG_AUDIT_TREE) += audit_tree.o obj-$(CONFIG_GCOV_KERNEL) += gcov/ +obj-$(CONFIG_KCOV) += kcov.o obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_KGDB) += debug/ obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o diff --git a/kernel/acct.c b/kernel/acct.c index 74963d192c5d..37f1dc696fbd 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -99,7 +99,7 @@ static int check_free_space(struct bsd_acct_struct *acct) { struct kstatfs sbuf; - if (time_is_before_jiffies(acct->needcheck)) + if (time_is_after_jiffies(acct->needcheck)) goto out; /* May block */ diff --git a/kernel/async.c b/kernel/async.c index 4c3773c0bf63..f1fd155abff6 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -84,20 +84,24 @@ static atomic_t entry_count; static async_cookie_t lowest_in_progress(struct async_domain *domain) { - struct list_head *pending; + struct async_entry *first = NULL; async_cookie_t ret = ASYNC_COOKIE_MAX; unsigned long flags; spin_lock_irqsave(&async_lock, flags); - if (domain) - pending = &domain->pending; - else - pending = &async_global_pending; + if (domain) { + if (!list_empty(&domain->pending)) + first = list_first_entry(&domain->pending, + struct async_entry, domain_list); + } else { + if (!list_empty(&async_global_pending)) + first = list_first_entry(&async_global_pending, + struct async_entry, global_list); + } - if (!list_empty(pending)) - ret = list_first_entry(pending, struct async_entry, - domain_list)->cookie; + if (first) + ret = first->cookie; spin_unlock_irqrestore(&async_lock, flags); return ret; diff --git a/kernel/audit.c b/kernel/audit.c index 34f690b9213a..e228b88dfd23 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -80,13 +80,13 @@ static int audit_initialized; #define AUDIT_OFF 0 #define AUDIT_ON 1 #define AUDIT_LOCKED 2 -u32 audit_enabled; -u32 audit_ever_enabled; +u32 audit_enabled = AUDIT_OFF; +u32 audit_ever_enabled = !!AUDIT_OFF; EXPORT_SYMBOL_GPL(audit_enabled); /* Default state when kernel boots without any parameters. */ -static u32 audit_default; +static u32 audit_default = AUDIT_OFF; /* If auditing cannot proceed, audit_failure selects what happens. 
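The kernel/acct.c hunk above flips time_is_before_jiffies() to time_is_after_jiffies(), so the free-space check is skipped while acct->needcheck still lies in the future. Below is a minimal userspace sketch of the wraparound-safe comparison behind those macros; the simplified helpers take the current time as a parameter and the sample values are made up for illustration, this is not the kernel's jiffies.h.

#include <stdio.h>

/* Simplified, wraparound-safe analogues of the helpers in
 * include/linux/jiffies.h; the real macros use the global jiffies counter,
 * here it is passed in explicitly for illustration. */
#define time_after(a, b)	((long)((b) - (a)) < 0)	/* a is later than b */
#define time_before(a, b)	time_after(b, a)
#define time_is_after(now, x)	time_before(now, x)	/* x is still ahead */

int main(void)
{
	unsigned long now = 1000, needcheck = 1500;

	/* check_free_space() now bails out while the deadline is ahead ... */
	if (time_is_after(now, needcheck))
		printf("needcheck in the future: skip the statfs\n");

	/* ... and only does the (possibly blocking) statfs once it has passed */
	now = 2000;
	if (!time_is_after(now, needcheck))
		printf("needcheck passed: recheck free space\n");
	return 0;
}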
*/ static u32 audit_failure = AUDIT_FAIL_PRINTK; @@ -1185,8 +1185,6 @@ static int __init audit_init(void) skb_queue_head_init(&audit_skb_queue); skb_queue_head_init(&audit_skb_hold_queue); audit_initialized = AUDIT_INITIALIZED; - audit_enabled = audit_default; - audit_ever_enabled |= !!audit_default; audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized"); @@ -1203,6 +1201,8 @@ static int __init audit_enable(char *str) audit_default = !!simple_strtol(str, NULL, 0); if (!audit_default) audit_initialized = AUDIT_DISABLED; + audit_enabled = audit_default; + audit_ever_enabled = !!audit_enabled; pr_info("%s\n", audit_default ? "enabled (after initialization)" : "disabled (until reboot)"); diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 939945a5649c..a162661c9d60 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -457,13 +457,15 @@ void audit_remove_watch_rule(struct audit_krule *krule) list_del(&krule->rlist); if (list_empty(&watch->rules)) { + /* + * audit_remove_watch() drops our reference to 'parent' which + * can get freed. Grab our own reference to be safe. + */ + audit_get_parent(parent); audit_remove_watch(watch); - - if (list_empty(&parent->watches)) { - audit_get_parent(parent); + if (list_empty(&parent->watches)) fsnotify_destroy_mark(&parent->mark, audit_watch_group); - audit_put_parent(parent); - } + audit_put_parent(parent); } } diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index b0799bced518..3608fa1aec8a 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -20,8 +20,10 @@ /* Called from syscall */ static struct bpf_map *array_map_alloc(union bpf_attr *attr) { + u32 elem_size, array_size, index_mask, max_entries; + bool unpriv = !capable(CAP_SYS_ADMIN); struct bpf_array *array; - u32 elem_size, array_size; + u64 mask64; /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || @@ -36,12 +38,33 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) elem_size = round_up(attr->value_size, 8); + max_entries = attr->max_entries; + + /* On 32 bit archs roundup_pow_of_two() with max_entries that has + * upper most bit set in u32 space is undefined behavior due to + * resulting 1U << 32, so do it manually here in u64 space. + */ + mask64 = fls_long(max_entries - 1); + mask64 = 1ULL << mask64; + mask64 -= 1; + + index_mask = mask64; + if (unpriv) { + /* round up array size to nearest power of 2, + * since cpu will speculate within index_mask limits + */ + max_entries = index_mask + 1; + /* Check for overflows. 
*/ + if (max_entries < attr->max_entries) + return ERR_PTR(-E2BIG); + } + /* check round_up into zero and u32 overflow */ if (elem_size == 0 || - attr->max_entries > (U32_MAX - PAGE_SIZE - sizeof(*array)) / elem_size) + max_entries > (U32_MAX - PAGE_SIZE - sizeof(*array)) / elem_size) return ERR_PTR(-ENOMEM); - array_size = sizeof(*array) + attr->max_entries * elem_size; + array_size = sizeof(*array) + max_entries * elem_size; /* allocate all map elements and zero-initialize them */ array = kzalloc(array_size, GFP_USER | __GFP_NOWARN); @@ -50,6 +73,8 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) if (!array) return ERR_PTR(-ENOMEM); } + array->index_mask = index_mask; + array->map.unpriv_array = unpriv; /* copy mandatory map attributes */ array->map.key_size = attr->key_size; @@ -70,7 +95,7 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key) if (index >= array->map.max_entries) return NULL; - return array->value + array->elem_size * index; + return array->value + array->elem_size * (index & array->index_mask); } /* Called from syscall */ @@ -111,7 +136,9 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, /* all elements already exist */ return -EEXIST; - memcpy(array->value + array->elem_size * index, value, map->value_size); + memcpy(array->value + + array->elem_size * (index & array->index_mask), + value, map->value_size); return 0; } diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 334b1bdd572c..eb52d11fdaa7 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -137,6 +137,77 @@ void __bpf_prog_free(struct bpf_prog *fp) } EXPORT_SYMBOL_GPL(__bpf_prog_free); +static bool bpf_is_jmp_and_has_target(const struct bpf_insn *insn) +{ + return BPF_CLASS(insn->code) == BPF_JMP && + /* Call and Exit are both special jumps with no + * target inside the BPF instruction image. + */ + BPF_OP(insn->code) != BPF_CALL && + BPF_OP(insn->code) != BPF_EXIT; +} + +static void bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta) +{ + struct bpf_insn *insn = prog->insnsi; + u32 i, insn_cnt = prog->len; + + for (i = 0; i < insn_cnt; i++, insn++) { + if (!bpf_is_jmp_and_has_target(insn)) + continue; + + /* Adjust offset of jmps if we cross boundaries. */ + if (i < pos && i + insn->off + 1 > pos) + insn->off += delta; + else if (i > pos + delta && i + insn->off + 1 <= pos + delta) + insn->off -= delta; + } +} + +struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, + const struct bpf_insn *patch, u32 len) +{ + u32 insn_adj_cnt, insn_rest, insn_delta = len - 1; + struct bpf_prog *prog_adj; + + /* Since our patchlet doesn't expand the image, we're done. */ + if (insn_delta == 0) { + memcpy(prog->insnsi + off, patch, sizeof(*patch)); + return prog; + } + + insn_adj_cnt = prog->len + insn_delta; + + /* Several new instructions need to be inserted. Make room + * for them. Likely, there's no need for a new allocation as + * last page could have large enough tailroom. + */ + prog_adj = bpf_prog_realloc(prog, bpf_prog_size(insn_adj_cnt), + GFP_USER); + if (!prog_adj) + return NULL; + + prog_adj->len = insn_adj_cnt; + + /* Patching happens in 3 steps: + * + * 1) Move over tail of insnsi from next instruction onwards, + * so we can patch the single target insn with one or more + * new ones (patching is always from 1 to n insns, n > 0). + * 2) Inject new instructions at the target location. + * 3) Adjust branch offsets if necessary. 
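The bpf/arraymap.c hunks above derive index_mask by rounding max_entries up to the next power of two in 64-bit arithmetic, precisely because roundup_pow_of_two() on a u32 with the top bit set would end up shifting by 32. A standalone sketch of that derivation follows; the loop-based fls_long() stand-in and the index_mask_for() helper name are my own, only the arithmetic mirrors array_map_alloc().

#include <stdio.h>
#include <stdint.h>

/* Portable stand-in for the kernel's fls_long(): position of the highest
 * set bit counting from 1, or 0 if the value is 0. */
static unsigned int fls_long(unsigned long x)
{
	unsigned int r = 0;

	while (x) {
		x >>= 1;
		r++;
	}
	return r;
}

/* Mirrors the mask derivation in array_map_alloc(): do the rounding in
 * u64 space so max_entries = 0x80000001 never triggers a 1U << 32. */
static uint32_t index_mask_for(uint32_t max_entries)
{
	uint64_t mask64 = fls_long(max_entries - 1);

	mask64 = 1ULL << mask64;
	mask64 -= 1;
	return (uint32_t)mask64;
}

int main(void)
{
	/* lookups are then done as value + elem_size * (index & mask) */
	printf("max_entries=5          -> mask 0x%x\n", index_mask_for(5));
	printf("max_entries=0x80000001 -> mask 0x%x\n",
	       index_mask_for(0x80000001u));
	return 0;
}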
+ */ + insn_rest = insn_adj_cnt - off - len; + + memmove(prog_adj->insnsi + off + len, prog_adj->insnsi + off + 1, + sizeof(*patch) * insn_rest); + memcpy(prog_adj->insnsi + off, patch, sizeof(*patch) * len); + + bpf_adj_branches(prog_adj, off, insn_delta); + + return prog_adj; +} + #ifdef CONFIG_BPF_JIT struct bpf_binary_header * bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, @@ -185,6 +256,7 @@ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) } EXPORT_SYMBOL_GPL(__bpf_call_base); +#ifndef CONFIG_BPF_JIT_ALWAYS_ON /** * __bpf_prog_run - run eBPF program on a given context * @ctx: is the data we are operating on @@ -372,7 +444,7 @@ select_insn: DST = tmp; CONT; ALU_MOD_X: - if (unlikely(SRC == 0)) + if (unlikely((u32)SRC == 0)) return 0; tmp = (u32) DST; DST = do_div(tmp, (u32) SRC); @@ -391,7 +463,7 @@ select_insn: DST = div64_u64(DST, SRC); CONT; ALU_DIV_X: - if (unlikely(SRC == 0)) + if (unlikely((u32)SRC == 0)) return 0; tmp = (u32) DST; do_div(tmp, (u32) SRC); @@ -446,7 +518,7 @@ select_insn: struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2; struct bpf_array *array = container_of(map, struct bpf_array, map); struct bpf_prog *prog; - u64 index = BPF_R3; + u32 index = BPF_R3; if (unlikely(index >= array->map.max_entries)) goto out; @@ -654,6 +726,13 @@ load_byte: return 0; } +#else +static unsigned int __bpf_prog_ret0(void *ctx, const struct bpf_insn *insn) +{ + return 0; +} +#endif + bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp) { @@ -700,9 +779,23 @@ static int bpf_check_tail_call(const struct bpf_prog *fp) */ int bpf_prog_select_runtime(struct bpf_prog *fp) { +#ifndef CONFIG_BPF_JIT_ALWAYS_ON fp->bpf_func = (void *) __bpf_prog_run; - +#else + fp->bpf_func = (void *) __bpf_prog_ret0; +#endif + + /* eBPF JITs can rewrite the program in case constant + * blinding is active. However, in case of error during + * blinding, bpf_int_jit_compile() must always return a + * valid program, which in this case would simply not + * be JITed, but falls back to the interpreter. + */ bpf_int_jit_compile(fp); +#ifdef CONFIG_BPF_JIT_ALWAYS_ON + if (!fp->jited) + return -ENOTSUPP; +#endif bpf_prog_lock_ro(fp); /* The tail call compatibility check can only be done at diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 4e32cc94edd9..424accd20c2d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -447,57 +447,6 @@ void bpf_register_prog_type(struct bpf_prog_type_list *tl) list_add(&tl->list_node, &bpf_prog_types); } -/* fixup insn->imm field of bpf_call instructions: - * if (insn->imm == BPF_FUNC_map_lookup_elem) - * insn->imm = bpf_map_lookup_elem - __bpf_call_base; - * else if (insn->imm == BPF_FUNC_map_update_elem) - * insn->imm = bpf_map_update_elem - __bpf_call_base; - * else ... 
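bpf_patch_insn_single() above splices a multi-instruction patch over a single instruction: keep the head, inject the patch, move the tail up, then fix relative jump offsets that cross the patched region. The following is a toy, non-kernel sketch of the same splice on a plain int array; it allocates a fresh buffer instead of reusing tailroom and leaves out the bpf_adj_branches() step.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Toy version of the splice in bpf_patch_insn_single(): the slot at 'off'
 * is replaced by 'len' new slots and everything behind it shifts up.
 * The kernel variant also adjusts jump offsets (bpf_adj_branches()). */
static int *patch_one(const int *prog, int prog_len, int off,
		      const int *patch, int len, int *new_len)
{
	int rest = prog_len - off - 1;	/* insns behind the patched one */
	int *out;

	*new_len = prog_len + len - 1;
	out = malloc(sizeof(*out) * *new_len);
	if (!out)
		return NULL;

	memcpy(out, prog, sizeof(*out) * off);			/* head  */
	memcpy(out + off, patch, sizeof(*out) * len);		/* patch */
	memcpy(out + off + len, prog + off + 1, sizeof(*out) * rest); /* tail */
	return out;
}

int main(void)
{
	int prog[]  = { 10, 20, 30, 40 };
	int patch[] = { 21, 22, 23 };	/* replaces the single insn "20" */
	int new_len, i;
	int *out = patch_one(prog, 4, 1, patch, 3, &new_len);

	if (!out)
		return 1;
	for (i = 0; i < new_len; i++)
		printf("%d ", out[i]);	/* 10 21 22 23 30 40 */
	printf("\n");
	free(out);
	return 0;
}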
- * - * this function is called after eBPF program passed verification - */ -static void fixup_bpf_calls(struct bpf_prog *prog) -{ - const struct bpf_func_proto *fn; - int i; - - for (i = 0; i < prog->len; i++) { - struct bpf_insn *insn = &prog->insnsi[i]; - - if (insn->code == (BPF_JMP | BPF_CALL)) { - /* we reach here when program has bpf_call instructions - * and it passed bpf_check(), means that - * ops->get_func_proto must have been supplied, check it - */ - BUG_ON(!prog->aux->ops->get_func_proto); - - if (insn->imm == BPF_FUNC_get_route_realm) - prog->dst_needed = 1; - if (insn->imm == BPF_FUNC_get_prandom_u32) - bpf_user_rnd_init_once(); - if (insn->imm == BPF_FUNC_tail_call) { - /* mark bpf_tail_call as different opcode - * to avoid conditional branch in - * interpeter for every normal call - * and to prevent accidental JITing by - * JIT compiler that doesn't support - * bpf_tail_call yet - */ - insn->imm = 0; - insn->code |= BPF_X; - continue; - } - - fn = prog->aux->ops->get_func_proto(insn->imm); - /* all functions that have prototype and verifier allowed - * programs to call them, must be real in-kernel functions - */ - BUG_ON(!fn->func); - insn->imm = fn->func - __bpf_call_base; - } - } -} - /* drop refcnt on maps used by eBPF program and free auxilary data */ static void free_used_maps(struct bpf_prog_aux *aux) { @@ -680,9 +629,6 @@ static int bpf_prog_load(union bpf_attr *attr) if (err < 0) goto free_used_maps; - /* fixup BPF_CALL->imm field */ - fixup_bpf_calls(prog); - /* eBPF program is ready to be JITed */ err = bpf_prog_select_runtime(prog); if (err < 0) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2cbfba78d3db..c14003840bc5 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -186,6 +186,14 @@ struct verifier_stack_elem { struct verifier_stack_elem *next; }; +struct bpf_insn_aux_data { + union { + enum bpf_reg_type ptr_type; /* pointer type for load/store insns */ + struct bpf_map *map_ptr; /* pointer for call insn into lookup_elem */ + }; + bool seen; /* this insn was processed by the verifier */ +}; + #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ /* single container for all structs @@ -200,6 +208,7 @@ struct verifier_env { struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */ u32 used_map_cnt; /* number of used maps */ bool allow_ptr_leaks; + struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */ }; /* verbose verifier prints what it's seeing @@ -313,7 +322,8 @@ static const char *const bpf_jmp_string[16] = { [BPF_EXIT >> 4] = "exit", }; -static void print_bpf_insn(struct bpf_insn *insn) +static void print_bpf_insn(const struct verifier_env *env, + const struct bpf_insn *insn) { u8 class = BPF_CLASS(insn->code); @@ -377,9 +387,19 @@ static void print_bpf_insn(struct bpf_insn *insn) insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->src_reg, insn->imm); - } else if (BPF_MODE(insn->code) == BPF_IMM) { - verbose("(%02x) r%d = 0x%x\n", - insn->code, insn->dst_reg, insn->imm); + } else if (BPF_MODE(insn->code) == BPF_IMM && + BPF_SIZE(insn->code) == BPF_DW) { + /* At this point, we already made sure that the second + * part of the ldimm64 insn is accessible. 
+ */ + u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; + bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD; + + if (map_ptr && !env->allow_ptr_leaks) + imm = 0; + + verbose("(%02x) r%d = 0x%llx\n", insn->code, + insn->dst_reg, (unsigned long long)imm); } else { verbose("BUG_ld_%02x\n", insn->code); return; @@ -663,6 +683,13 @@ static bool is_pointer_value(struct verifier_env *env, int regno) } } +static bool is_ctx_reg(struct verifier_env *env, int regno) +{ + const struct reg_state *reg = &env->cur_state.regs[regno]; + + return reg->type == PTR_TO_CTX; +} + /* check whether memory at (regno + off) is accessible for t = (read | write) * if t==write, value_regno is a register which value is stored into memory * if t==read, value_regno is a register which will receive the value from memory @@ -754,6 +781,17 @@ static int check_xadd(struct verifier_env *env, struct bpf_insn *insn) if (err) return err; + if (is_pointer_value(env, insn->src_reg)) { + verbose("R%d leaks addr into mem\n", insn->src_reg); + return -EACCES; + } + + if (is_ctx_reg(env, insn->dst_reg)) { + verbose("BPF_XADD stores into R%d context is not allowed\n", + insn->dst_reg); + return -EACCES; + } + /* check whether atomic_add can read the memory */ err = check_mem_access(env, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_READ, -1); @@ -929,7 +967,7 @@ error: return -EINVAL; } -static int check_call(struct verifier_env *env, int func_id) +static int check_call(struct verifier_env *env, int func_id, int insn_idx) { struct verifier_state *state = &env->cur_state; const struct bpf_func_proto *fn = NULL; @@ -965,6 +1003,13 @@ static int check_call(struct verifier_env *env, int func_id) err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &map); if (err) return err; + if (func_id == BPF_FUNC_tail_call) { + if (map == NULL) { + verbose("verifier bug\n"); + return -EINVAL; + } + env->insn_aux_data[insn_idx].map_ptr = map; + } err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &map); if (err) return err; @@ -1028,7 +1073,8 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn) } } else { if (insn->src_reg != BPF_REG_0 || insn->off != 0 || - (insn->imm != 16 && insn->imm != 32 && insn->imm != 64)) { + (insn->imm != 16 && insn->imm != 32 && insn->imm != 64) || + BPF_CLASS(insn->code) == BPF_ALU64) { verbose("BPF_END uses reserved fields\n"); return -EINVAL; } @@ -1132,6 +1178,11 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn) return -EINVAL; } + if (opcode == BPF_ARSH && BPF_CLASS(insn->code) != BPF_ALU64) { + verbose("BPF_ARSH not supported for 32 bit ALU\n"); + return -EINVAL; + } + if ((opcode == BPF_LSH || opcode == BPF_RSH || opcode == BPF_ARSH) && BPF_SRC(insn->code) == BPF_K) { int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 
64 : 32; @@ -1758,16 +1809,17 @@ static int do_check(struct verifier_env *env) if (log_level) { verbose("%d: ", insn_idx); - print_bpf_insn(insn); + print_bpf_insn(env, insn); } + env->insn_aux_data[insn_idx].seen = true; if (class == BPF_ALU || class == BPF_ALU64) { err = check_alu_op(env, insn); if (err) return err; } else if (class == BPF_LDX) { - enum bpf_reg_type src_reg_type; + enum bpf_reg_type *prev_src_type, src_reg_type; /* check for reserved fields is already done */ @@ -1796,16 +1848,18 @@ static int do_check(struct verifier_env *env) continue; } - if (insn->imm == 0) { + prev_src_type = &env->insn_aux_data[insn_idx].ptr_type; + + if (*prev_src_type == NOT_INIT) { /* saw a valid insn * dst_reg = *(u32 *)(src_reg + off) - * use reserved 'imm' field to mark this insn + * save type to validate intersecting paths */ - insn->imm = src_reg_type; + *prev_src_type = src_reg_type; - } else if (src_reg_type != insn->imm && + } else if (src_reg_type != *prev_src_type && (src_reg_type == PTR_TO_CTX || - insn->imm == PTR_TO_CTX)) { + *prev_src_type == PTR_TO_CTX)) { /* ABuser program is trying to use the same insn * dst_reg = *(u32*) (src_reg + off) * with different pointer types: @@ -1818,7 +1872,7 @@ static int do_check(struct verifier_env *env) } } else if (class == BPF_STX) { - enum bpf_reg_type dst_reg_type; + enum bpf_reg_type *prev_dst_type, dst_reg_type; if (BPF_MODE(insn->code) == BPF_XADD) { err = check_xadd(env, insn); @@ -1846,11 +1900,13 @@ static int do_check(struct verifier_env *env) if (err) return err; - if (insn->imm == 0) { - insn->imm = dst_reg_type; - } else if (dst_reg_type != insn->imm && + prev_dst_type = &env->insn_aux_data[insn_idx].ptr_type; + + if (*prev_dst_type == NOT_INIT) { + *prev_dst_type = dst_reg_type; + } else if (dst_reg_type != *prev_dst_type && (dst_reg_type == PTR_TO_CTX || - insn->imm == PTR_TO_CTX)) { + *prev_dst_type == PTR_TO_CTX)) { verbose("same insn cannot be used with different pointers\n"); return -EINVAL; } @@ -1866,6 +1922,12 @@ static int do_check(struct verifier_env *env) if (err) return err; + if (is_ctx_reg(env, insn->dst_reg)) { + verbose("BPF_ST stores into R%d context is not allowed\n", + insn->dst_reg); + return -EACCES; + } + /* check that memory (dst_reg + off) is writeable */ err = check_mem_access(env, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_WRITE, @@ -1885,7 +1947,7 @@ static int do_check(struct verifier_env *env) return -EINVAL; } - err = check_call(env, insn->imm); + err = check_call(env, insn->imm, insn_idx); if (err) return err; @@ -1952,6 +2014,7 @@ process_bpf_exit: return err; insn_idx++; + env->insn_aux_data[insn_idx].seen = true; } else { verbose("invalid BPF_LD mode\n"); return -EINVAL; @@ -2081,23 +2144,60 @@ static void convert_pseudo_ld_imm64(struct verifier_env *env) insn->src_reg = 0; } -static void adjust_branches(struct bpf_prog *prog, int pos, int delta) +/* single env->prog->insni[off] instruction was replaced with the range + * insni[off, off + cnt). 
Adjust corresponding insn_aux_data by copying + * [0, off) and [off, end) to new locations, so the patched range stays zero + */ +static int adjust_insn_aux_data(struct verifier_env *env, u32 prog_len, + u32 off, u32 cnt) { - struct bpf_insn *insn = prog->insnsi; - int insn_cnt = prog->len; + struct bpf_insn_aux_data *new_data, *old_data = env->insn_aux_data; int i; - for (i = 0; i < insn_cnt; i++, insn++) { - if (BPF_CLASS(insn->code) != BPF_JMP || - BPF_OP(insn->code) == BPF_CALL || - BPF_OP(insn->code) == BPF_EXIT) - continue; + if (cnt == 1) + return 0; + new_data = vzalloc(sizeof(struct bpf_insn_aux_data) * prog_len); + if (!new_data) + return -ENOMEM; + memcpy(new_data, old_data, sizeof(struct bpf_insn_aux_data) * off); + memcpy(new_data + off + cnt - 1, old_data + off, + sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1)); + for (i = off; i < off + cnt - 1; i++) + new_data[i].seen = true; + env->insn_aux_data = new_data; + vfree(old_data); + return 0; +} + +static struct bpf_prog *bpf_patch_insn_data(struct verifier_env *env, u32 off, + const struct bpf_insn *patch, u32 len) +{ + struct bpf_prog *new_prog; + + new_prog = bpf_patch_insn_single(env->prog, off, patch, len); + if (!new_prog) + return NULL; + if (adjust_insn_aux_data(env, new_prog->len, off, len)) + return NULL; + return new_prog; +} - /* adjust offset of jmps if necessary */ - if (i < pos && i + insn->off + 1 > pos) - insn->off += delta; - else if (i > pos + delta && i + insn->off + 1 <= pos + delta) - insn->off -= delta; +/* The verifier does more data flow analysis than llvm and will not explore + * branches that are dead at run time. Malicious programs can have dead code + * too. Therefore replace all dead at-run-time code with nops. + */ +static void sanitize_dead_code(struct verifier_env *env) +{ + struct bpf_insn_aux_data *aux_data = env->insn_aux_data; + struct bpf_insn nop = BPF_MOV64_REG(BPF_REG_0, BPF_REG_0); + struct bpf_insn *insn = env->prog->insnsi; + const int insn_cnt = env->prog->len; + int i; + + for (i = 0; i < insn_cnt; i++) { + if (aux_data[i].seen) + continue; + memcpy(insn + i, &nop, sizeof(nop)); } } @@ -2107,17 +2207,18 @@ static void adjust_branches(struct bpf_prog *prog, int pos, int delta) static int convert_ctx_accesses(struct verifier_env *env) { struct bpf_insn *insn = env->prog->insnsi; - int insn_cnt = env->prog->len; + const int insn_cnt = env->prog->len; struct bpf_insn insn_buf[16]; struct bpf_prog *new_prog; - u32 cnt; - int i; enum bpf_access_type type; + int i, delta = 0; if (!env->prog->aux->ops->convert_ctx_access) return 0; for (i = 0; i < insn_cnt; i++, insn++) { + u32 cnt; + if (insn->code == (BPF_LDX | BPF_MEM | BPF_W)) type = BPF_READ; else if (insn->code == (BPF_STX | BPF_MEM | BPF_W)) @@ -2125,11 +2226,8 @@ static int convert_ctx_accesses(struct verifier_env *env) else continue; - if (insn->imm != PTR_TO_CTX) { - /* clear internal mark */ - insn->imm = 0; + if (env->insn_aux_data[i + delta].ptr_type != PTR_TO_CTX) continue; - } cnt = env->prog->aux->ops-> convert_ctx_access(type, insn->dst_reg, insn->src_reg, @@ -2139,34 +2237,107 @@ static int convert_ctx_accesses(struct verifier_env *env) return -EINVAL; } - if (cnt == 1) { - memcpy(insn, insn_buf, sizeof(*insn)); - continue; - } - - /* several new insns need to be inserted. 
Make room for them */ - insn_cnt += cnt - 1; - new_prog = bpf_prog_realloc(env->prog, - bpf_prog_size(insn_cnt), - GFP_USER); + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); if (!new_prog) return -ENOMEM; - new_prog->len = insn_cnt; + delta += cnt - 1; - memmove(new_prog->insnsi + i + cnt, new_prog->insns + i + 1, - sizeof(*insn) * (insn_cnt - i - cnt)); + /* keep walking new program and skip insns we just inserted */ + env->prog = new_prog; + insn = new_prog->insnsi + i + delta; + } - /* copy substitute insns in place of load instruction */ - memcpy(new_prog->insnsi + i, insn_buf, sizeof(*insn) * cnt); + return 0; +} - /* adjust branches in the whole program */ - adjust_branches(new_prog, i, cnt - 1); +/* fixup insn->imm field of bpf_call instructions + * + * this function is called after eBPF program passed verification + */ +static int fixup_bpf_calls(struct verifier_env *env) +{ + struct bpf_prog *prog = env->prog; + struct bpf_insn *insn = prog->insnsi; + const struct bpf_func_proto *fn; + const int insn_cnt = prog->len; + struct bpf_insn insn_buf[16]; + struct bpf_prog *new_prog; + struct bpf_map *map_ptr; + int i, cnt, delta = 0; - /* keep walking new program and skip insns we just inserted */ - env->prog = new_prog; - insn = new_prog->insnsi + i + cnt - 1; - i += cnt - 1; + for (i = 0; i < insn_cnt; i++, insn++) { + if (insn->code == (BPF_ALU | BPF_MOD | BPF_X) || + insn->code == (BPF_ALU | BPF_DIV | BPF_X)) { + /* due to JIT bugs clear upper 32-bits of src register + * before div/mod operation + */ + insn_buf[0] = BPF_MOV32_REG(insn->src_reg, insn->src_reg); + insn_buf[1] = *insn; + cnt = 2; + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + continue; + } + + if (insn->code != (BPF_JMP | BPF_CALL)) + continue; + + if (insn->imm == BPF_FUNC_get_route_realm) + prog->dst_needed = 1; + if (insn->imm == BPF_FUNC_get_prandom_u32) + bpf_user_rnd_init_once(); + if (insn->imm == BPF_FUNC_tail_call) { + /* mark bpf_tail_call as different opcode to avoid + * conditional branch in the interpeter for every normal + * call and to prevent accidental JITing by JIT compiler + * that doesn't support bpf_tail_call yet + */ + insn->imm = 0; + insn->code |= BPF_X; + + /* instead of changing every JIT dealing with tail_call + * emit two extra insns: + * if (index >= max_entries) goto out; + * index &= array->index_mask; + * to avoid out-of-bounds cpu speculation + */ + map_ptr = env->insn_aux_data[i + delta].map_ptr; + if (!map_ptr->unpriv_array) + continue; + insn_buf[0] = BPF_JMP_IMM(BPF_JGE, BPF_REG_3, + map_ptr->max_entries, 2); + insn_buf[1] = BPF_ALU32_IMM(BPF_AND, BPF_REG_3, + container_of(map_ptr, + struct bpf_array, + map)->index_mask); + insn_buf[2] = *insn; + cnt = 3; + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + continue; + } + + fn = prog->aux->ops->get_func_proto(insn->imm); + /* all functions that have prototype and verifier allowed + * programs to call them, must be real in-kernel functions + */ + if (!fn->func) { + verbose("kernel subsystem misconfigured func %d\n", + insn->imm); + return -EFAULT; + } + insn->imm = fn->func - __bpf_call_base; } return 0; @@ -2210,6 +2381,11 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) if (!env) return -ENOMEM; + env->insn_aux_data = 
vzalloc(sizeof(struct bpf_insn_aux_data) * + (*prog)->len); + ret = -ENOMEM; + if (!env->insn_aux_data) + goto err_free_env; env->prog = *prog; /* grab the mutex to protect few globals used by verifier */ @@ -2228,12 +2404,12 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) /* log_* values have to be sane */ if (log_size < 128 || log_size > UINT_MAX >> 8 || log_level == 0 || log_ubuf == NULL) - goto free_env; + goto err_unlock; ret = -ENOMEM; log_buf = vmalloc(log_size); if (!log_buf) - goto free_env; + goto err_unlock; } else { log_level = 0; } @@ -2262,9 +2438,15 @@ skip_full_check: free_states(env); if (ret == 0) + sanitize_dead_code(env); + + if (ret == 0) /* program is valid, convert *(u32*)(ctx + off) accesses */ ret = convert_ctx_accesses(env); + if (ret == 0) + ret = fixup_bpf_calls(env); + if (log_level && log_len >= log_size - 1) { BUG_ON(log_len >= log_size); /* verifier log exceeded user supplied buffer */ @@ -2302,14 +2484,16 @@ skip_full_check: free_log_buf: if (log_level) vfree(log_buf); -free_env: if (!env->prog->aux->used_maps) /* if we didn't copy map pointers into bpf_prog_info, release * them now. Otherwise free_bpf_prog_info() will release them. */ release_maps(env); *prog = env->prog; - kfree(env); +err_unlock: mutex_unlock(&bpf_verifier_lock); + vfree(env->insn_aux_data); +err_free_env: + kfree(env); return ret; } diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 25cf44889559..3fdb7545852e 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -717,10 +717,10 @@ static void css_set_move_task(struct task_struct *task, if (to_cset) { /* - * We are synchronized through cgroup_threadgroup_rwsem - * against PF_EXITING setting such that we can't race - * against cgroup_exit() changing the css_set to - * init_css_set and dropping the old one. + * We are synchronized through css_set_lock against + * PF_EXITING setting such that we can't race against + * cgroup_exit() disassociating the task from the + * css_set. */ WARN_ON_ONCE(task->flags & PF_EXITING); @@ -2799,6 +2799,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) int retval = 0; mutex_lock(&cgroup_mutex); + percpu_down_write(&cgroup_threadgroup_rwsem); for_each_root(root) { struct cgroup *from_cgrp; @@ -2813,6 +2814,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) if (retval) break; } + percpu_up_write(&cgroup_threadgroup_rwsem); mutex_unlock(&cgroup_mutex); return retval; @@ -4072,6 +4074,8 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) mutex_lock(&cgroup_mutex); + percpu_down_write(&cgroup_threadgroup_rwsem); + /* all tasks in @from are being moved, all csets are source */ spin_lock_irq(&css_set_lock); list_for_each_entry(link, &from->cset_links, cset_link) @@ -4100,6 +4104,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) } while (task && !ret); out_err: cgroup_migrate_finish(&preloaded_csets); + percpu_up_write(&cgroup_threadgroup_rwsem); mutex_unlock(&cgroup_mutex); return ret; } @@ -5701,19 +5706,22 @@ void cgroup_exit(struct task_struct *tsk) int i; /* - * Unlink from @tsk from its css_set. As migration path can't race - * with us, we can check css_set and cg_list without synchronization. + * Avoid potential race with the migrate path. + */ + spin_lock_irq(&css_set_lock); + /* + * Unlink from @tsk from its css_set. 
*/ cset = task_css_set(tsk); if (!list_empty(&tsk->cg_list)) { - spin_lock_irq(&css_set_lock); css_set_move_task(tsk, cset, NULL, false); - spin_unlock_irq(&css_set_lock); } else { get_css_set(cset); } + spin_unlock_irq(&css_set_lock); + /* see cgroup_post_fork() for details */ for_each_subsys_which(ss, i, &have_exit_callback) ss->exit(tsk); diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config new file mode 100644 index 000000000000..d70829033bb7 --- /dev/null +++ b/kernel/configs/android-base.config @@ -0,0 +1,160 @@ +# KEEP ALPHABETICALLY SORTED +# CONFIG_DEVKMEM is not set +# CONFIG_DEVMEM is not set +# CONFIG_FHANDLE is not set +# CONFIG_INET_LRO is not set +# CONFIG_NFSD is not set +# CONFIG_NFS_FS is not set +# CONFIG_OABI_COMPAT is not set +# CONFIG_SYSVIPC is not set +# CONFIG_USELIB is not set +CONFIG_ANDROID=y +CONFIG_ANDROID_BINDER_IPC=y +CONFIG_ANDROID_LOW_MEMORY_KILLER=y +CONFIG_ARMV8_DEPRECATED=y +CONFIG_ASHMEM=y +CONFIG_AUDIT=y +CONFIG_BLK_DEV_INITRD=y +CONFIG_CGROUPS=y +CONFIG_CGROUP_BPF=y +CONFIG_CGROUP_CPUACCT=y +CONFIG_CGROUP_DEBUG=y +CONFIG_CGROUP_FREEZER=y +CONFIG_CGROUP_SCHED=y +CONFIG_CP15_BARRIER_EMULATION=y +CONFIG_DEFAULT_SECURITY_SELINUX=y +CONFIG_EMBEDDED=y +CONFIG_FB=y +CONFIG_HARDENED_USERCOPY=y +CONFIG_HIGH_RES_TIMERS=y +CONFIG_IKCONFIG=y +CONFIG_IKCONFIG_PROC=y +CONFIG_INET6_AH=y +CONFIG_INET6_ESP=y +CONFIG_INET6_IPCOMP=y +CONFIG_INET=y +CONFIG_INET_DIAG_DESTROY=y +CONFIG_INET_ESP=y +CONFIG_INET_XFRM_MODE_TUNNEL=y +CONFIG_IP6_NF_FILTER=y +CONFIG_IP6_NF_IPTABLES=y +CONFIG_IP6_NF_MANGLE=y +CONFIG_IP6_NF_RAW=y +CONFIG_IP6_NF_TARGET_REJECT=y +CONFIG_IPV6=y +CONFIG_IPV6_MIP6=y +CONFIG_IPV6_MULTIPLE_TABLES=y +CONFIG_IPV6_OPTIMISTIC_DAD=y +CONFIG_IPV6_ROUTER_PREF=y +CONFIG_IPV6_ROUTE_INFO=y +CONFIG_IP_ADVANCED_ROUTER=y +CONFIG_IP_MULTICAST=y +CONFIG_IP_MULTIPLE_TABLES=y +CONFIG_IP_NF_ARPFILTER=y +CONFIG_IP_NF_ARPTABLES=y +CONFIG_IP_NF_ARP_MANGLE=y +CONFIG_IP_NF_FILTER=y +CONFIG_IP_NF_IPTABLES=y +CONFIG_IP_NF_MANGLE=y +CONFIG_IP_NF_MATCH_AH=y +CONFIG_IP_NF_MATCH_ECN=y +CONFIG_IP_NF_MATCH_TTL=y +CONFIG_IP_NF_NAT=y +CONFIG_IP_NF_RAW=y +CONFIG_IP_NF_SECURITY=y +CONFIG_IP_NF_TARGET_MASQUERADE=y +CONFIG_IP_NF_TARGET_NETMAP=y +CONFIG_IP_NF_TARGET_REDIRECT=y +CONFIG_IP_NF_TARGET_REJECT=y +CONFIG_MODULES=y +CONFIG_MODULE_UNLOAD=y +CONFIG_MODVERSIONS=y +CONFIG_NET=y +CONFIG_NETDEVICES=y +CONFIG_NETFILTER=y +CONFIG_NETFILTER_TPROXY=y +CONFIG_NETFILTER_XT_MATCH_COMMENT=y +CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=y +CONFIG_NETFILTER_XT_MATCH_CONNMARK=y +CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y +CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=y +CONFIG_NETFILTER_XT_MATCH_HELPER=y +CONFIG_NETFILTER_XT_MATCH_IPRANGE=y +CONFIG_NETFILTER_XT_MATCH_LENGTH=y +CONFIG_NETFILTER_XT_MATCH_LIMIT=y +CONFIG_NETFILTER_XT_MATCH_MAC=y +CONFIG_NETFILTER_XT_MATCH_MARK=y +CONFIG_NETFILTER_XT_MATCH_PKTTYPE=y +CONFIG_NETFILTER_XT_MATCH_POLICY=y +CONFIG_NETFILTER_XT_MATCH_QUOTA=y +CONFIG_NETFILTER_XT_MATCH_SOCKET=y +CONFIG_NETFILTER_XT_MATCH_STATE=y +CONFIG_NETFILTER_XT_MATCH_STATISTIC=y +CONFIG_NETFILTER_XT_MATCH_STRING=y +CONFIG_NETFILTER_XT_MATCH_TIME=y +CONFIG_NETFILTER_XT_MATCH_U32=y +CONFIG_NETFILTER_XT_TARGET_CLASSIFY=y +CONFIG_NETFILTER_XT_TARGET_CONNMARK=y +CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y +CONFIG_NETFILTER_XT_TARGET_IDLETIMER=y +CONFIG_NETFILTER_XT_TARGET_MARK=y +CONFIG_NETFILTER_XT_TARGET_NFLOG=y +CONFIG_NETFILTER_XT_TARGET_NFQUEUE=y +CONFIG_NETFILTER_XT_TARGET_SECMARK=y +CONFIG_NETFILTER_XT_TARGET_TCPMSS=y +CONFIG_NETFILTER_XT_TARGET_TPROXY=y 
+CONFIG_NETFILTER_XT_TARGET_TRACE=y +CONFIG_NET_CLS_ACT=y +CONFIG_NET_CLS_U32=y +CONFIG_NET_EMATCH=y +CONFIG_NET_EMATCH_U32=y +CONFIG_NET_KEY=y +CONFIG_NET_SCHED=y +CONFIG_NET_SCH_HTB=y +CONFIG_NF_CONNTRACK=y +CONFIG_NF_CONNTRACK_AMANDA=y +CONFIG_NF_CONNTRACK_EVENTS=y +CONFIG_NF_CONNTRACK_FTP=y +CONFIG_NF_CONNTRACK_H323=y +CONFIG_NF_CONNTRACK_IPV4=y +CONFIG_NF_CONNTRACK_IPV6=y +CONFIG_NF_CONNTRACK_IRC=y +CONFIG_NF_CONNTRACK_NETBIOS_NS=y +CONFIG_NF_CONNTRACK_PPTP=y +CONFIG_NF_CONNTRACK_SANE=y +CONFIG_NF_CONNTRACK_SECMARK=y +CONFIG_NF_CONNTRACK_TFTP=y +CONFIG_NF_CT_NETLINK=y +CONFIG_NF_CT_PROTO_DCCP=y +CONFIG_NF_CT_PROTO_SCTP=y +CONFIG_NF_CT_PROTO_UDPLITE=y +CONFIG_NF_NAT=y +CONFIG_NO_HZ=y +CONFIG_PACKET=y +CONFIG_PM_AUTOSLEEP=y +CONFIG_PM_WAKELOCKS=y +CONFIG_PPP=y +CONFIG_PPP_BSDCOMP=y +CONFIG_PPP_DEFLATE=y +CONFIG_PPP_MPPE=y +CONFIG_PREEMPT=y +CONFIG_QUOTA=y +CONFIG_RANDOMIZE_BASE=y +CONFIG_RTC_CLASS=y +CONFIG_RT_GROUP_SCHED=y +CONFIG_SECCOMP=y +CONFIG_SECURITY=y +CONFIG_SECURITY_NETWORK=y +CONFIG_SECURITY_SELINUX=y +CONFIG_SETEND_EMULATION=y +CONFIG_STAGING=y +CONFIG_SWP_EMULATION=y +CONFIG_SYNC=y +CONFIG_TUN=y +CONFIG_UNIX=y +CONFIG_USB_GADGET=y +CONFIG_USB_CONFIGFS=y +CONFIG_USB_CONFIGFS_F_FS=y +CONFIG_USB_CONFIGFS_F_MIDI=y +CONFIG_USB_OTG_WAKELOCK=y +CONFIG_XFRM_USER=y diff --git a/kernel/configs/android-recommended.config b/kernel/configs/android-recommended.config new file mode 100644 index 000000000000..297756be369c --- /dev/null +++ b/kernel/configs/android-recommended.config @@ -0,0 +1,125 @@ +# KEEP ALPHABETICALLY SORTED +# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set +# CONFIG_INPUT_MOUSE is not set +# CONFIG_LEGACY_PTYS is not set +# CONFIG_NF_CONNTRACK_SIP is not set +# CONFIG_PM_WAKELOCKS_GC is not set +# CONFIG_VT is not set +CONFIG_BACKLIGHT_LCD_SUPPORT=y +CONFIG_BLK_DEV_DM=y +CONFIG_BLK_DEV_LOOP=y +CONFIG_BLK_DEV_RAM=y +CONFIG_BLK_DEV_RAM_SIZE=8192 +CONFIG_COMPACTION=y +CONFIG_DEBUG_RODATA=y +CONFIG_DM_CRYPT=y +CONFIG_DM_UEVENT=y +CONFIG_DM_VERITY=y +CONFIG_DM_VERITY_FEC=y +CONFIG_DRAGONRISE_FF=y +CONFIG_ENABLE_DEFAULT_TRACERS=y +CONFIG_EXT4_FS=y +CONFIG_EXT4_FS_SECURITY=y +CONFIG_FUSE_FS=y +CONFIG_GREENASIA_FF=y +CONFIG_HIDRAW=y +CONFIG_HID_A4TECH=y +CONFIG_HID_ACRUX=y +CONFIG_HID_ACRUX_FF=y +CONFIG_HID_APPLE=y +CONFIG_HID_BELKIN=y +CONFIG_HID_CHERRY=y +CONFIG_HID_CHICONY=y +CONFIG_HID_CYPRESS=y +CONFIG_HID_DRAGONRISE=y +CONFIG_HID_ELECOM=y +CONFIG_HID_EMS_FF=y +CONFIG_HID_EZKEY=y +CONFIG_HID_GREENASIA=y +CONFIG_HID_GYRATION=y +CONFIG_HID_HOLTEK=y +CONFIG_HID_KENSINGTON=y +CONFIG_HID_KEYTOUCH=y +CONFIG_HID_KYE=y +CONFIG_HID_LCPOWER=y +CONFIG_HID_LOGITECH=y +CONFIG_HID_LOGITECH_DJ=y +CONFIG_HID_MAGICMOUSE=y +CONFIG_HID_MICROSOFT=y +CONFIG_HID_MONTEREY=y +CONFIG_HID_MULTITOUCH=y +CONFIG_HID_NTRIG=y +CONFIG_HID_ORTEK=y +CONFIG_HID_PANTHERLORD=y +CONFIG_HID_PETALYNX=y +CONFIG_HID_PICOLCD=y +CONFIG_HID_PRIMAX=y +CONFIG_HID_PRODIKEYS=y +CONFIG_HID_ROCCAT=y +CONFIG_HID_SAITEK=y +CONFIG_HID_SAMSUNG=y +CONFIG_HID_SMARTJOYPLUS=y +CONFIG_HID_SONY=y +CONFIG_HID_SPEEDLINK=y +CONFIG_HID_SUNPLUS=y +CONFIG_HID_THRUSTMASTER=y +CONFIG_HID_TIVO=y +CONFIG_HID_TOPSEED=y +CONFIG_HID_TWINHAN=y +CONFIG_HID_UCLOGIC=y +CONFIG_HID_WACOM=y +CONFIG_HID_WALTOP=y +CONFIG_HID_WIIMOTE=y +CONFIG_HID_ZEROPLUS=y +CONFIG_HID_ZYDACRON=y +CONFIG_INPUT_EVDEV=y +CONFIG_INPUT_GPIO=y +CONFIG_INPUT_JOYSTICK=y +CONFIG_INPUT_MISC=y +CONFIG_INPUT_TABLET=y +CONFIG_INPUT_UINPUT=y +CONFIG_ION=y +CONFIG_JOYSTICK_XPAD=y +CONFIG_JOYSTICK_XPAD_FF=y +CONFIG_JOYSTICK_XPAD_LEDS=y +CONFIG_KALLSYMS_ALL=y +CONFIG_KSM=y 
+CONFIG_LOGIG940_FF=y +CONFIG_LOGIRUMBLEPAD2_FF=y +CONFIG_LOGITECH_FF=y +CONFIG_MD=y +CONFIG_MEDIA_SUPPORT=y +CONFIG_MSDOS_FS=y +CONFIG_PANIC_TIMEOUT=5 +CONFIG_PANTHERLORD_FF=y +CONFIG_PERF_EVENTS=y +CONFIG_PM_DEBUG=y +CONFIG_PM_RUNTIME=y +CONFIG_PM_WAKELOCKS_LIMIT=0 +CONFIG_POWER_SUPPLY=y +CONFIG_PSTORE=y +CONFIG_PSTORE_CONSOLE=y +CONFIG_PSTORE_RAM=y +CONFIG_SCHEDSTATS=y +CONFIG_SMARTJOYPLUS_FF=y +CONFIG_SND=y +CONFIG_SOUND=y +CONFIG_SUSPEND_TIME=y +CONFIG_TABLET_USB_ACECAD=y +CONFIG_TABLET_USB_AIPTEK=y +CONFIG_TABLET_USB_GTCO=y +CONFIG_TABLET_USB_HANWANG=y +CONFIG_TABLET_USB_KBTAB=y +CONFIG_TASKSTATS=y +CONFIG_TASK_DELAY_ACCT=y +CONFIG_TASK_IO_ACCOUNTING=y +CONFIG_TASK_XACCT=y +CONFIG_TIMER_STATS=y +CONFIG_TMPFS=y +CONFIG_TMPFS_POSIX_ACL=y +CONFIG_UHID=y +CONFIG_USB_ANNOUNCE_NEW_DEVICES=y +CONFIG_USB_EHCI_HCD=y +CONFIG_USB_HIDDEV=y +CONFIG_USB_USBNET=y +CONFIG_VFAT_FS=y diff --git a/kernel/cpu.c b/kernel/cpu.c index e822cb0e18d5..5b4440d57f89 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -91,6 +91,11 @@ static struct { #define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map) #define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map) +void cpu_hotplug_mutex_held(void) +{ + lockdep_assert_held(&cpu_hotplug.lock); +} +EXPORT_SYMBOL(cpu_hotplug_mutex_held); void get_online_cpus(void) { @@ -361,6 +366,9 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) if (!cpu_online(cpu)) return -EINVAL; + if (!tasks_frozen && !cpu_isolated(cpu) && num_online_uniso_cpus() == 1) + return -EBUSY; + cpu_hotplug_begin(); err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); @@ -372,6 +380,21 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) goto out_release; } + /* + * By now we've cleared cpu_active_mask, wait for all preempt-disabled + * and RCU users of this state to go away such that all new such users + * will observe it. + * + * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might + * not imply sync_sched(), so wait for both. + * + * Do sync before park smpboot threads to take care the rcu boost case. + */ + if (IS_ENABLED(CONFIG_PREEMPT)) + synchronize_rcu_mult(call_rcu, call_rcu_sched); + else + synchronize_rcu(); + smpboot_park_threads(cpu); /* @@ -505,8 +528,8 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen) ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); if (ret) { nr_calls--; - pr_warn("%s: attempt to bring up CPU %u failed\n", - __func__, cpu); + pr_warn_ratelimited("%s: attempt to bring up CPU %u failed\n", + __func__, cpu); goto out_notify; } diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 29c7240172d3..a599351997ad 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -60,6 +60,7 @@ #include <linux/cgroup.h> #include <linux/wait.h> +struct static_key cpusets_pre_enable_key __read_mostly = STATIC_KEY_INIT_FALSE; struct static_key cpusets_enabled_key __read_mostly = STATIC_KEY_INIT_FALSE; /* See "Frequency meter" comments, below. */ @@ -174,9 +175,9 @@ typedef enum { } cpuset_flagbits_t; /* convenient tests for these bits */ -static inline bool is_cpuset_online(const struct cpuset *cs) +static inline bool is_cpuset_online(struct cpuset *cs) { - return test_bit(CS_ONLINE, &cs->flags); + return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css); } static inline int is_cpu_exclusive(const struct cpuset *cs) @@ -806,16 +807,15 @@ done: * 'cpus' is removed, then call this routine to rebuild the * scheduler's dynamic sched domains. * - * Call with cpuset_mutex held. Takes get_online_cpus(). 
*/ -static void rebuild_sched_domains_locked(void) +static void rebuild_sched_domains_unlocked(void) { struct sched_domain_attr *attr; cpumask_var_t *doms; int ndoms; + cpu_hotplug_mutex_held(); lockdep_assert_held(&cpuset_mutex); - get_online_cpus(); /* * We have raced with CPU hotplug. Don't do anything to avoid @@ -823,27 +823,27 @@ static void rebuild_sched_domains_locked(void) * Anyways, hotplug work item will rebuild sched domains. */ if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask)) - goto out; + return; /* Generate domain masks and attrs */ ndoms = generate_sched_domains(&doms, &attr); /* Have scheduler rebuild the domains */ partition_sched_domains(ndoms, doms, attr); -out: - put_online_cpus(); } #else /* !CONFIG_SMP */ -static void rebuild_sched_domains_locked(void) +static void rebuild_sched_domains_unlocked(void) { } #endif /* CONFIG_SMP */ void rebuild_sched_domains(void) { + get_online_cpus(); mutex_lock(&cpuset_mutex); - rebuild_sched_domains_locked(); + rebuild_sched_domains_unlocked(); mutex_unlock(&cpuset_mutex); + put_online_cpus(); } /** @@ -875,7 +875,6 @@ static void update_tasks_cpumask(struct cpuset *cs) * * On legacy hierachy, effective_cpus will be the same with cpu_allowed. * - * Called with cpuset_mutex held */ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus) { @@ -930,7 +929,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus) rcu_read_unlock(); if (need_rebuild_sched_domains) - rebuild_sched_domains_locked(); + rebuild_sched_domains_unlocked(); } /** @@ -1289,7 +1288,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) cs->relax_domain_level = val; if (!cpumask_empty(cs->cpus_allowed) && is_sched_load_balance(cs)) - rebuild_sched_domains_locked(); + rebuild_sched_domains_unlocked(); } return 0; @@ -1320,7 +1319,6 @@ static void update_tasks_flags(struct cpuset *cs) * cs: the cpuset to update * turning_on: whether the flag is being set or cleared * - * Call with cpuset_mutex held. 
*/ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, @@ -1355,7 +1353,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, spin_unlock_irq(&callback_lock); if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) - rebuild_sched_domains_locked(); + rebuild_sched_domains_unlocked(); if (spread_flag_changed) update_tasks_flags(cs); @@ -1620,6 +1618,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, cpuset_filetype_t type = cft->private; int retval = 0; + get_online_cpus(); mutex_lock(&cpuset_mutex); if (!is_cpuset_online(cs)) { retval = -ENODEV; @@ -1657,6 +1656,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, } out_unlock: mutex_unlock(&cpuset_mutex); + put_online_cpus(); return retval; } @@ -1667,6 +1667,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, cpuset_filetype_t type = cft->private; int retval = -ENODEV; + get_online_cpus(); mutex_lock(&cpuset_mutex); if (!is_cpuset_online(cs)) goto out_unlock; @@ -1681,6 +1682,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, } out_unlock: mutex_unlock(&cpuset_mutex); + put_online_cpus(); return retval; } @@ -1719,6 +1721,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, kernfs_break_active_protection(of->kn); flush_work(&cpuset_hotplug_work); + get_online_cpus(); mutex_lock(&cpuset_mutex); if (!is_cpuset_online(cs)) goto out_unlock; @@ -1744,6 +1747,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, free_trial_cpuset(trialcs); out_unlock: mutex_unlock(&cpuset_mutex); + put_online_cpus(); kernfs_unbreak_active_protection(of->kn); css_put(&cs->css); flush_workqueue(cpuset_migrate_mm_wq); @@ -1912,6 +1916,7 @@ static struct cftype files[] = { { .name = "memory_pressure", .read_u64 = cpuset_read_u64, + .private = FILE_MEMORY_PRESSURE, }, { @@ -2049,13 +2054,14 @@ out_unlock: /* * If the cpuset being removed has its flag 'sched_load_balance' * enabled, then simulate turning sched_load_balance off, which - * will call rebuild_sched_domains_locked(). + * will call rebuild_sched_domains_unlocked(). */ static void cpuset_css_offline(struct cgroup_subsys_state *css) { struct cpuset *cs = css_cs(css); + get_online_cpus(); mutex_lock(&cpuset_mutex); if (is_sched_load_balance(cs)) @@ -2065,6 +2071,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) clear_bit(CS_ONLINE, &cs->flags); mutex_unlock(&cpuset_mutex); + put_online_cpus(); } static void cpuset_css_free(struct cgroup_subsys_state *css) @@ -2293,6 +2300,13 @@ retry: mutex_unlock(&cpuset_mutex); } +static bool force_rebuild; + +void cpuset_force_rebuild(void) +{ + force_rebuild = true; +} + /** * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset * @@ -2367,8 +2381,10 @@ static void cpuset_hotplug_workfn(struct work_struct *work) } /* rebuild sched domains if cpus_allowed has changed */ - if (cpus_updated) + if (cpus_updated || force_rebuild) { + force_rebuild = false; rebuild_sched_domains(); + } } void cpuset_update_active_cpus(bool cpu_online) @@ -2387,6 +2403,11 @@ void cpuset_update_active_cpus(bool cpu_online) schedule_work(&cpuset_hotplug_work); } +void cpuset_wait_for_hotplug(void) +{ + flush_work(&cpuset_hotplug_work); +} + /* * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY]. * Call this routine anytime after node_states[N_MEMORY] changes. 
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 0b891286a150..3990c1f73e45 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -357,7 +357,7 @@ poll_again: } kdb_printf("\n"); for (i = 0; i < count; i++) { - if (kallsyms_symbol_next(p_tmp, i) < 0) + if (WARN_ON(!kallsyms_symbol_next(p_tmp, i))) break; kdb_printf("%s ", p_tmp); *(p_tmp + len) = '\0'; diff --git a/kernel/events/core.c b/kernel/events/core.c index 95c447e658f7..322f63370038 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3437,22 +3437,27 @@ u64 perf_event_read_local(struct perf_event *event) static int perf_event_read(struct perf_event *event, bool group) { - int ret = 0; + int event_cpu, ret = 0; /* * If event is enabled and currently active on a CPU, update the * value in the event structure: */ + event_cpu = READ_ONCE(event->oncpu); + if (event->state == PERF_EVENT_STATE_ACTIVE && - !cpu_isolated(event->oncpu)) { + !cpu_isolated(event_cpu)) { struct perf_read_data data = { .event = event, .group = group, .ret = 0, }; + + if ((unsigned int)event_cpu >= nr_cpu_ids) + return 0; if (!event->attr.exclude_idle || - !per_cpu(is_idle, event->oncpu)) { - smp_call_function_single(event->oncpu, + !per_cpu(is_idle, event_cpu)) { + smp_call_function_single(event_cpu, __perf_event_read, &data, 1); ret = data.ret; } @@ -7297,6 +7302,7 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) } event->tp_event->prog = prog; + event->tp_event->bpf_prog_owner = event; return 0; } @@ -7309,7 +7315,7 @@ static void perf_event_free_bpf_prog(struct perf_event *event) return; prog = event->tp_event->prog; - if (prog) { + if (prog && event->tp_event->bpf_prog_owner == event) { event->tp_event->prog = NULL; bpf_prog_put_rcu(prog); } @@ -8693,28 +8699,27 @@ SYSCALL_DEFINE5(perf_event_open, goto err_context; /* - * Do not allow to attach to a group in a different - * task or CPU context: + * Make sure we're both events for the same CPU; + * grouping events for different CPUs is broken; since + * you can never concurrently schedule them anyhow. */ - if (move_group) { - /* - * Make sure we're both on the same task, or both - * per-cpu events. - */ - if (group_leader->ctx->task != ctx->task) - goto err_context; + if (group_leader->cpu != event->cpu) + goto err_context; - /* - * Make sure we're both events for the same CPU; - * grouping events for different CPUs is broken; since - * you can never concurrently schedule them anyhow. - */ - if (group_leader->cpu != event->cpu) - goto err_context; - } else { - if (group_leader->ctx != ctx) - goto err_context; - } + /* + * Make sure we're both on the same task, or both + * per-CPU events. + */ + if (group_leader->ctx->task != ctx->task) + goto err_context; + + /* + * Do not allow to attach to a group in a different task + * or CPU context. If we're moving SW events, we'll fix + * this up later, so allow that. 
+ */ + if (!move_group && group_leader->ctx != ctx) + goto err_context; /* * Only a group leader can be exclusive or pinned diff --git a/kernel/exit.c b/kernel/exit.c index d8a12cc06aee..06d54f550c36 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -53,6 +53,7 @@ #include <linux/oom.h> #include <linux/writeback.h> #include <linux/shm.h> +#include <linux/kcov.h> #include "sched/tune.h" @@ -669,6 +670,7 @@ void do_exit(long code) TASKS_RCU(int tasks_rcu_i); profile_task_exit(tsk); + kcov_task_exit(tsk); WARN_ON(blk_needs_flush_plug(tsk)); @@ -762,7 +764,7 @@ void do_exit(long code) disassociate_ctty(1); exit_task_namespaces(tsk); exit_task_work(tsk); - exit_thread(); + exit_thread(tsk); /* * Flush inherited counters to the parent - before the parent diff --git a/kernel/extable.c b/kernel/extable.c index e820ccee9846..4f06fc34313f 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -66,7 +66,7 @@ static inline int init_kernel_text(unsigned long addr) return 0; } -int core_kernel_text(unsigned long addr) +int notrace core_kernel_text(unsigned long addr) { if (addr >= (unsigned long)_stext && addr < (unsigned long)_etext) diff --git a/kernel/fork.c b/kernel/fork.c index 2845c5bdc8e3..4251e3806640 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -59,6 +59,7 @@ #include <linux/tsacct_kern.h> #include <linux/cn_proc.h> #include <linux/freezer.h> +#include <linux/kaiser.h> #include <linux/delayacct.h> #include <linux/taskstats_kern.h> #include <linux/random.h> @@ -76,6 +77,7 @@ #include <linux/aio.h> #include <linux/compiler.h> #include <linux/sysctl.h> +#include <linux/kcov.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> @@ -148,18 +150,18 @@ static inline void free_task_struct(struct task_struct *tsk) } #endif -void __weak arch_release_thread_info(struct thread_info *ti) +void __weak arch_release_thread_stack(unsigned long *stack) { } -#ifndef CONFIG_ARCH_THREAD_INFO_ALLOCATOR +#ifndef CONFIG_ARCH_THREAD_STACK_ALLOCATOR /* * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a * kmemcache based allocator. */ # if THREAD_SIZE >= PAGE_SIZE -static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, +static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) { struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP, @@ -168,30 +170,33 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, return page ? 
page_address(page) : NULL; } -static inline void free_thread_info(struct thread_info *ti) +static inline void free_thread_stack(unsigned long *stack) { - kasan_alloc_pages(virt_to_page(ti), THREAD_SIZE_ORDER); - free_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER); + struct page *page = virt_to_page(stack); + + kasan_alloc_pages(page, THREAD_SIZE_ORDER); + kaiser_unmap_thread_stack(stack); + __free_kmem_pages(page, THREAD_SIZE_ORDER); } # else -static struct kmem_cache *thread_info_cache; +static struct kmem_cache *thread_stack_cache; -static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, +static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) { - return kmem_cache_alloc_node(thread_info_cache, THREADINFO_GFP, node); + return kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node); } -static void free_thread_info(struct thread_info *ti) +static void free_thread_stack(unsigned long *stack) { - kmem_cache_free(thread_info_cache, ti); + kmem_cache_free(thread_stack_cache, stack); } -void thread_info_cache_init(void) +void thread_stack_cache_init(void) { - thread_info_cache = kmem_cache_create("thread_info", THREAD_SIZE, + thread_stack_cache = kmem_cache_create("thread_stack", THREAD_SIZE, THREAD_SIZE, 0, NULL); - BUG_ON(thread_info_cache == NULL); + BUG_ON(thread_stack_cache == NULL); } # endif #endif @@ -214,9 +219,9 @@ struct kmem_cache *vm_area_cachep; /* SLAB cache for mm_struct structures (tsk->mm) */ static struct kmem_cache *mm_cachep; -static void account_kernel_stack(struct thread_info *ti, int account) +static void account_kernel_stack(unsigned long *stack, int account) { - struct zone *zone = page_zone(virt_to_page(ti)); + struct zone *zone = page_zone(virt_to_page(stack)); mod_zone_page_state(zone, NR_KERNEL_STACK, account); } @@ -224,8 +229,8 @@ static void account_kernel_stack(struct thread_info *ti, int account) void free_task(struct task_struct *tsk) { account_kernel_stack(tsk->stack, -1); - arch_release_thread_info(tsk->stack); - free_thread_info(tsk->stack); + arch_release_thread_stack(tsk->stack); + free_thread_stack(tsk->stack); rt_mutex_debug_task_free(tsk); ftrace_graph_exit_task(tsk); put_seccomp_filter(tsk); @@ -336,7 +341,7 @@ void set_task_stack_end_magic(struct task_struct *tsk) static struct task_struct *dup_task_struct(struct task_struct *orig, int node) { struct task_struct *tsk; - struct thread_info *ti; + unsigned long *stack; int err; if (node == NUMA_NO_NODE) @@ -345,15 +350,19 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) if (!tsk) return NULL; - ti = alloc_thread_info_node(tsk, node); - if (!ti) + stack = alloc_thread_stack_node(tsk, node); + if (!stack) goto free_tsk; err = arch_dup_task_struct(tsk, orig); if (err) - goto free_ti; + goto free_stack; + + tsk->stack = stack; - tsk->stack = ti; + err = kaiser_map_thread_stack(tsk->stack); + if (err) + goto free_stack; #ifdef CONFIG_SECCOMP /* * We must handle setting up seccomp filters once we're under @@ -370,7 +379,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) set_task_stack_end_magic(tsk); #ifdef CONFIG_CC_STACKPROTECTOR - tsk->stack_canary = get_random_int(); + tsk->stack_canary = get_random_long(); #endif /* @@ -385,12 +394,14 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) tsk->task_frag.page = NULL; tsk->wake_q.next = NULL; - account_kernel_stack(ti, 1); + account_kernel_stack(stack, 1); + + kcov_task_init(tsk); return tsk; -free_ti: - 
free_thread_info(ti); +free_stack: + free_thread_stack(stack); free_tsk: free_task_struct(tsk); return NULL; @@ -695,6 +706,26 @@ void __mmdrop(struct mm_struct *mm) } EXPORT_SYMBOL_GPL(__mmdrop); +static inline void __mmput(struct mm_struct *mm) +{ + VM_BUG_ON(atomic_read(&mm->mm_users)); + + uprobe_clear_state(mm); + exit_aio(mm); + ksm_exit(mm); + khugepaged_exit(mm); /* must run before exit_mmap */ + exit_mmap(mm); + set_mm_exe_file(mm, NULL); + if (!list_empty(&mm->mmlist)) { + spin_lock(&mmlist_lock); + list_del(&mm->mmlist); + spin_unlock(&mmlist_lock); + } + if (mm->binfmt) + module_put(mm->binfmt->module); + mmdrop(mm); +} + /* * Decrement the use count and release all resources for an mm. */ @@ -704,26 +735,27 @@ int mmput(struct mm_struct *mm) might_sleep(); if (atomic_dec_and_test(&mm->mm_users)) { - uprobe_clear_state(mm); - exit_aio(mm); - ksm_exit(mm); - khugepaged_exit(mm); /* must run before exit_mmap */ - exit_mmap(mm); - set_mm_exe_file(mm, NULL); - if (!list_empty(&mm->mmlist)) { - spin_lock(&mmlist_lock); - list_del(&mm->mmlist); - spin_unlock(&mmlist_lock); - } - if (mm->binfmt) - module_put(mm->binfmt->module); - mmdrop(mm); + __mmput(mm); mm_freed = 1; } return mm_freed; } EXPORT_SYMBOL_GPL(mmput); +static void mmput_async_fn(struct work_struct *work) +{ + struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work); + __mmput(mm); +} + +void mmput_async(struct mm_struct *mm) +{ + if (atomic_dec_and_test(&mm->mm_users)) { + INIT_WORK(&mm->async_put_work, mmput_async_fn); + schedule_work(&mm->async_put_work); + } +} + /** * set_mm_exe_file - change a reference to the mm's executable file * @@ -832,8 +864,7 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) mm = get_task_mm(task); if (mm && mm != current->mm && - !ptrace_may_access(task, mode) && - !capable(CAP_SYS_RESOURCE)) { + !ptrace_may_access(task, mode)) { mmput(mm); mm = ERR_PTR(-EACCES); } diff --git a/kernel/futex.c b/kernel/futex.c index af29863f3349..a09c1dd1f659 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1621,6 +1621,9 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, struct futex_q *this, *next; WAKE_Q(wake_q); + if (nr_wake < 0 || nr_requeue < 0) + return -EINVAL; + if (requeue_pi) { /* * Requeue PI only works on two distinct uaddrs. This @@ -1939,8 +1942,12 @@ static int unqueue_me(struct futex_q *q) /* In the common case we don't take the spinlock, which is nice. */ retry: - lock_ptr = q->lock_ptr; - barrier(); + /* + * q->lock_ptr can change between this read and the following spin_lock. + * Use READ_ONCE to forbid the compiler from reloading q->lock_ptr and + * optimizing lock_ptr out of the logic below. + */ + lock_ptr = READ_ONCE(q->lock_ptr); if (lock_ptr != NULL) { spin_lock(lock_ptr); /* diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index c92e44855ddd..1276aabaab55 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -37,6 +37,7 @@ config ARCH_HAS_GCOV_PROFILE_ALL config GCOV_PROFILE_ALL bool "Profile entire Kernel" + depends on !COMPILE_TEST depends on GCOV_KERNEL depends on ARCH_HAS_GCOV_PROFILE_ALL default n diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c index 7080ae1eb6c1..f850e906564b 100644 --- a/kernel/gcov/base.c +++ b/kernel/gcov/base.c @@ -98,6 +98,12 @@ void __gcov_merge_icall_topn(gcov_type *counters, unsigned int n_counters) } EXPORT_SYMBOL(__gcov_merge_icall_topn); +void __gcov_exit(void) +{ + /* Unused. 
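The futex unqueue_me() hunk above replaces a plain load plus barrier() with READ_ONCE(q->lock_ptr) so the compiler cannot reload the pointer between the NULL check and the spin_lock(). Here is a small userspace sketch of that single-snapshot pattern; the READ_ONCE stand-in, the pthread mutex in place of the hash-bucket spinlock, and the struct layout are all illustrative, not the kernel's types.

#include <stdio.h>
#include <pthread.h>

/* Minimal stand-in for the kernel's READ_ONCE(): force a single load
 * through a volatile cast so the compiler cannot re-read the field. */
#define READ_ONCE(x) (*(const volatile __typeof__(x) *)&(x))

struct futex_q {
	pthread_mutex_t *lock_ptr;	/* may be changed by another thread */
};

static int unqueue(struct futex_q *q)
{
	/* Snapshot once; every later use refers to this local copy,
	 * mirroring the unqueue_me() change above. */
	pthread_mutex_t *lock_ptr = READ_ONCE(q->lock_ptr);

	if (lock_ptr != NULL) {
		pthread_mutex_lock(lock_ptr);
		/* ... re-check q->lock_ptr against the snapshot here ... */
		pthread_mutex_unlock(lock_ptr);
		return 1;
	}
	return 0;
}

int main(void)
{
	pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	struct futex_q q = { .lock_ptr = &lock };

	printf("%d\n", unqueue(&q));
	return 0;
}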
*/ +} +EXPORT_SYMBOL(__gcov_exit); + /** * gcov_enable_events - enable event reporting through gcov_event() * diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c index e25e92fb44fa..46a18e72bce6 100644 --- a/kernel/gcov/gcc_4_7.c +++ b/kernel/gcov/gcc_4_7.c @@ -18,7 +18,9 @@ #include <linux/vmalloc.h> #include "gcov.h" -#if __GNUC__ == 5 && __GNUC_MINOR__ >= 1 +#if (__GNUC__ >= 7) +#define GCOV_COUNTERS 9 +#elif (__GNUC__ > 5) || (__GNUC__ == 5 && __GNUC_MINOR__ >= 1) #define GCOV_COUNTERS 10 #elif __GNUC__ == 4 && __GNUC_MINOR__ >= 9 #define GCOV_COUNTERS 9 diff --git a/kernel/groups.c b/kernel/groups.c index 74d431d25251..5ea9847f172f 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -101,7 +101,7 @@ static int groups_from_user(struct group_info *group_info, } /* a simple Shell sort */ -static void groups_sort(struct group_info *group_info) +void groups_sort(struct group_info *group_info) { int base, max, stride; int gidsetsize = group_info->ngroups; @@ -128,6 +128,7 @@ static void groups_sort(struct group_info *group_info) stride /= 3; } } +EXPORT_SYMBOL(groups_sort); /* a simple bsearch */ int groups_search(const struct group_info *group_info, kgid_t grp) @@ -159,7 +160,6 @@ int groups_search(const struct group_info *group_info, kgid_t grp) void set_groups(struct cred *new, struct group_info *group_info) { put_group_info(new->group_info); - groups_sort(group_info); get_group_info(group_info); new->group_info = group_info; } @@ -243,6 +243,7 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist) return retval; } + groups_sort(group_info); retval = set_current_groups(group_info); put_group_info(group_info); diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c index 6c8e154c7384..4684b7595e63 100644 --- a/kernel/irq/cpuhotplug.c +++ b/kernel/irq/cpuhotplug.c @@ -36,10 +36,32 @@ static bool migrate_one_irq(struct irq_desc *desc) affinity = &available_cpus; if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { + /* + * The order of preference for selecting a fallback CPU is + * + * (1) online and un-isolated CPU from default affinity + * (2) online and un-isolated CPU + * (3) online CPU + */ cpumask_andnot(&available_cpus, cpu_online_mask, cpu_isolated_mask); - if (cpumask_empty(affinity)) + if (cpumask_intersects(&available_cpus, irq_default_affinity)) + cpumask_and(&available_cpus, &available_cpus, + irq_default_affinity); + else if (cpumask_empty(&available_cpus)) affinity = cpu_online_mask; + + /* + * We are overriding the affinity with all online and + * un-isolated cpus. The irq_set_affinity_locked() call + * below notifies this mask to the PM QOS affinity listener. + * That results in applying the CPU_DMA_LATENCY QOS + * to all the CPUs specified in the mask. But the low + * level irqchip driver sets the affinity of an irq + * to only one CPU. So pick only one CPU from the + * prepared mask while overriding the user affinity.
+ */ + affinity = cpumask_of(cpumask_any(affinity)); ret = true; } diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index e5c70dcb7f8e..2c2effdb4437 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1305,8 +1305,10 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) ret = __irq_set_trigger(desc, new->flags & IRQF_TRIGGER_MASK); - if (ret) + if (ret) { + irq_release_resources(desc); goto out_mask; + } } desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \ diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index a24c5b909047..b05509af0352 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -114,6 +114,11 @@ static ssize_t write_irq_affinity(int type, struct file *file, goto free_cpumask; } + if (cpumask_subset(new_value, cpu_isolated_mask)) { + err = -EINVAL; + goto free_cpumask; + } + /* * Do not allow disabling IRQs completely - it's too easy a * way to make the system unusable accidentally :-) At least diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 453ec4232852..e863b2339174 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -553,7 +553,7 @@ static __init int jump_label_test(void) return 0; } -late_initcall(jump_label_test); +early_initcall(jump_label_test); #endif /* STATIC_KEYS_SELFTEST */ #endif /* HAVE_JUMP_LABEL */ diff --git a/kernel/kcov.c b/kernel/kcov.c new file mode 100644 index 000000000000..5813e9375a93 --- /dev/null +++ b/kernel/kcov.c @@ -0,0 +1,431 @@ +#define pr_fmt(fmt) "kcov: " fmt + +#define DISABLE_BRANCH_PROFILING +#include <linux/atomic.h> +#include <linux/compiler.h> +#include <linux/errno.h> +#include <linux/export.h> +#include <linux/types.h> +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/init.h> +#include <linux/mm.h> +#include <linux/preempt.h> +#include <linux/printk.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/vmalloc.h> +#include <linux/debugfs.h> +#include <linux/uaccess.h> +#include <linux/kcov.h> +#include <asm/setup.h> + +/* Number of 64-bit words written per one comparison: */ +#define KCOV_WORDS_PER_CMP 4 + +/* + * kcov descriptor (one per opened debugfs file). + * State transitions of the descriptor: + * - initial state after open() + * - then there must be a single ioctl(KCOV_INIT_TRACE) call + * - then, mmap() call (several calls are allowed but not useful) + * - then, ioctl(KCOV_ENABLE, arg), where arg is + * KCOV_TRACE_PC - to trace only the PCs + * or + * KCOV_TRACE_CMP - to trace only the comparison operands + * - then, ioctl(KCOV_DISABLE) to disable the task. + * Enabling/disabling ioctls can be repeated (only one task at a time is allowed). + */ +struct kcov { + /* + * Reference counter. We keep one for: + * - opened file descriptor + * - task with enabled coverage (we can't unwire it from another task) + */ + atomic_t refcount; + /* The lock protects mode, size, area and t. */ + spinlock_t lock; + enum kcov_mode mode; + /* Size of arena (in longs for KCOV_MODE_TRACE). */ + unsigned size; + /* Coverage buffer shared with user space. */ + void *area; + /* Task for which we collect coverage, or NULL. */ + struct task_struct *t; +}; + +static bool check_kcov_mode(enum kcov_mode needed_mode, struct task_struct *t) +{ + enum kcov_mode mode; + + /* + * We are interested in code coverage as a function of syscall inputs, + * so we ignore code executed in interrupts.
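The state-transition list above maps directly onto a short user-space harness. A minimal sketch of the documented sequence (the ioctl encodings are assumed to match the new uapi header and are repeated inline; COVER_SIZE is an arbitrary buffer size):

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <unistd.h>

/* ioctl encodings as assumed from the new uapi kcov header */
#define KCOV_INIT_TRACE	_IOR('c', 1, unsigned long)
#define KCOV_ENABLE	_IO('c', 100)
#define KCOV_DISABLE	_IO('c', 101)
#define KCOV_TRACE_PC	0
#define COVER_SIZE	(64 << 10)	/* words; area[0] holds the PC count */

int main(void)
{
	unsigned long *cover, n, i;
	int fd = open("/sys/kernel/debug/kcov", O_RDWR);

	if (fd == -1)
		exit(1);
	if (ioctl(fd, KCOV_INIT_TRACE, COVER_SIZE))	/* exactly once */
		exit(1);
	cover = mmap(NULL, COVER_SIZE * sizeof(unsigned long),
		     PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (cover == MAP_FAILED)
		exit(1);
	if (ioctl(fd, KCOV_ENABLE, KCOV_TRACE_PC))
		exit(1);
	__atomic_store_n(&cover[0], 0, __ATOMIC_RELAXED);	/* reset count */
	read(-1, NULL, 0);	/* the traced syscall */
	n = __atomic_load_n(&cover[0], __ATOMIC_RELAXED);
	for (i = 0; i < n; i++)
		printf("0x%lx\n", cover[i + 1]);
	if (ioctl(fd, KCOV_DISABLE, 0))
		exit(1);
	return 0;
}

The word at area[0] is the running PC count, matching the __sanitizer_cov_trace_pc() writer below.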
+ */ + if (!in_task()) + return false; + mode = READ_ONCE(t->kcov_mode); + /* + * There is some code that runs in interrupts but for which + * in_interrupt() returns false (e.g. preempt_schedule_irq()). + * READ_ONCE()/barrier() effectively provides load-acquire wrt + * interrupts, there are paired barrier()/WRITE_ONCE() in + * kcov_ioctl_locked(). + */ + barrier(); + return mode == needed_mode; +} + +static unsigned long canonicalize_ip(unsigned long ip) +{ +#ifdef CONFIG_RANDOMIZE_BASE + ip -= kaslr_offset(); +#endif + return ip; +} + +/* + * Entry point from instrumented code. + * This is called once per basic-block/edge. + */ +void notrace __sanitizer_cov_trace_pc(void) +{ + struct task_struct *t; + unsigned long *area; + unsigned long ip = canonicalize_ip(_RET_IP_); + unsigned long pos; + + t = current; + if (!check_kcov_mode(KCOV_MODE_TRACE_PC, t)) + return; + + area = t->kcov_area; + /* The first 64-bit word is the number of subsequent PCs. */ + pos = READ_ONCE(area[0]) + 1; + if (likely(pos < t->kcov_size)) { + area[pos] = ip; + WRITE_ONCE(area[0], pos); + } +} +EXPORT_SYMBOL(__sanitizer_cov_trace_pc); + +#ifdef CONFIG_KCOV_ENABLE_COMPARISONS +static void write_comp_data(u64 type, u64 arg1, u64 arg2, u64 ip) +{ + struct task_struct *t; + u64 *area; + u64 count, start_index, end_pos, max_pos; + + t = current; + if (!check_kcov_mode(KCOV_MODE_TRACE_CMP, t)) + return; + + ip = canonicalize_ip(ip); + + /* + * We write all comparison arguments and types as u64. + * The buffer was allocated for t->kcov_size unsigned longs. + */ + area = (u64 *)t->kcov_area; + max_pos = t->kcov_size * sizeof(unsigned long); + + count = READ_ONCE(area[0]); + + /* Every record is KCOV_WORDS_PER_CMP 64-bit words. */ + start_index = 1 + count * KCOV_WORDS_PER_CMP; + end_pos = (start_index + KCOV_WORDS_PER_CMP) * sizeof(u64); + if (likely(end_pos <= max_pos)) { + area[start_index] = type; + area[start_index + 1] = arg1; + area[start_index + 2] = arg2; + area[start_index + 3] = ip; + WRITE_ONCE(area[0], count + 1); + } +} + +void notrace __sanitizer_cov_trace_cmp1(u8 arg1, u8 arg2) +{ + write_comp_data(KCOV_CMP_SIZE(0), arg1, arg2, _RET_IP_); +} +EXPORT_SYMBOL(__sanitizer_cov_trace_cmp1); + +void notrace __sanitizer_cov_trace_cmp2(u16 arg1, u16 arg2) +{ + write_comp_data(KCOV_CMP_SIZE(1), arg1, arg2, _RET_IP_); +} +EXPORT_SYMBOL(__sanitizer_cov_trace_cmp2); + +void notrace __sanitizer_cov_trace_cmp4(u32 arg1, u32 arg2) +{ + write_comp_data(KCOV_CMP_SIZE(2), arg1, arg2, _RET_IP_); +} +EXPORT_SYMBOL(__sanitizer_cov_trace_cmp4); + +void notrace __sanitizer_cov_trace_cmp8(u64 arg1, u64 arg2) +{ + write_comp_data(KCOV_CMP_SIZE(3), arg1, arg2, _RET_IP_); +} +EXPORT_SYMBOL(__sanitizer_cov_trace_cmp8); + +void notrace __sanitizer_cov_trace_const_cmp1(u8 arg1, u8 arg2) +{ + write_comp_data(KCOV_CMP_SIZE(0) | KCOV_CMP_CONST, arg1, arg2, + _RET_IP_); +} +EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp1); + +void notrace __sanitizer_cov_trace_const_cmp2(u16 arg1, u16 arg2) +{ + write_comp_data(KCOV_CMP_SIZE(1) | KCOV_CMP_CONST, arg1, arg2, + _RET_IP_); +} +EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp2); + +void notrace __sanitizer_cov_trace_const_cmp4(u32 arg1, u32 arg2) +{ + write_comp_data(KCOV_CMP_SIZE(2) | KCOV_CMP_CONST, arg1, arg2, + _RET_IP_); +} +EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp4); + +void notrace __sanitizer_cov_trace_const_cmp8(u64 arg1, u64 arg2) +{ + write_comp_data(KCOV_CMP_SIZE(3) | KCOV_CMP_CONST, arg1, arg2, + _RET_IP_); +} +EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp8); + +void notrace 
__sanitizer_cov_trace_switch(u64 val, u64 *cases) +{ + u64 i; + u64 count = cases[0]; + u64 size = cases[1]; + u64 type = KCOV_CMP_CONST; + + switch (size) { + case 8: + type |= KCOV_CMP_SIZE(0); + break; + case 16: + type |= KCOV_CMP_SIZE(1); + break; + case 32: + type |= KCOV_CMP_SIZE(2); + break; + case 64: + type |= KCOV_CMP_SIZE(3); + break; + default: + return; + } + for (i = 0; i < count; i++) + write_comp_data(type, cases[i + 2], val, _RET_IP_); +} +EXPORT_SYMBOL(__sanitizer_cov_trace_switch); +#endif /* ifdef CONFIG_KCOV_ENABLE_COMPARISONS */ + +static void kcov_get(struct kcov *kcov) +{ + atomic_inc(&kcov->refcount); +} + +static void kcov_put(struct kcov *kcov) +{ + if (atomic_dec_and_test(&kcov->refcount)) { + vfree(kcov->area); + kfree(kcov); + } +} + +void kcov_task_init(struct task_struct *t) +{ + t->kcov_mode = KCOV_MODE_DISABLED; + t->kcov_size = 0; + t->kcov_area = NULL; + t->kcov = NULL; +} + +void kcov_task_exit(struct task_struct *t) +{ + struct kcov *kcov; + + kcov = t->kcov; + if (kcov == NULL) + return; + spin_lock(&kcov->lock); + if (WARN_ON(kcov->t != t)) { + spin_unlock(&kcov->lock); + return; + } + /* Just so we don't leave dangling references behind. */ + kcov_task_init(t); + kcov->t = NULL; + kcov->mode = KCOV_MODE_INIT; + spin_unlock(&kcov->lock); + kcov_put(kcov); +} + +static int kcov_mmap(struct file *filep, struct vm_area_struct *vma) +{ + int res = 0; + void *area; + struct kcov *kcov = vma->vm_file->private_data; + unsigned long size, off; + struct page *page; + + area = vmalloc_user(vma->vm_end - vma->vm_start); + if (!area) + return -ENOMEM; + + spin_lock(&kcov->lock); + size = kcov->size * sizeof(unsigned long); + if (kcov->mode != KCOV_MODE_INIT || vma->vm_pgoff != 0 || + vma->vm_end - vma->vm_start != size) { + res = -EINVAL; + goto exit; + } + if (!kcov->area) { + kcov->area = area; + vma->vm_flags |= VM_DONTEXPAND; + spin_unlock(&kcov->lock); + for (off = 0; off < size; off += PAGE_SIZE) { + page = vmalloc_to_page(kcov->area + off); + if (vm_insert_page(vma, vma->vm_start + off, page)) + WARN_ONCE(1, "vm_insert_page() failed"); + } + return 0; + } +exit: + spin_unlock(&kcov->lock); + vfree(area); + return res; +} + +static int kcov_open(struct inode *inode, struct file *filep) +{ + struct kcov *kcov; + + kcov = kzalloc(sizeof(*kcov), GFP_KERNEL); + if (!kcov) + return -ENOMEM; + kcov->mode = KCOV_MODE_DISABLED; + atomic_set(&kcov->refcount, 1); + spin_lock_init(&kcov->lock); + filep->private_data = kcov; + return nonseekable_open(inode, filep); +} + +static int kcov_close(struct inode *inode, struct file *filep) +{ + kcov_put(filep->private_data); + return 0; +} + +static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, + unsigned long arg) +{ + struct task_struct *t; + unsigned long size, unused; + + switch (cmd) { + case KCOV_INIT_TRACE: + /* + * Enable kcov in trace mode and set up the buffer size. + * Must happen before anything else. + */ + if (kcov->mode != KCOV_MODE_DISABLED) + return -EBUSY; + /* + * Size must be at least 2 to hold current position and one PC. + * Later we allocate size * sizeof(unsigned long) memory, + * which must not overflow. + */ + size = arg; + if (size < 2 || size > INT_MAX / sizeof(unsigned long)) + return -EINVAL; + kcov->size = size; + kcov->mode = KCOV_MODE_INIT; + return 0; + case KCOV_ENABLE: + /* + * Enable coverage for the current task. + * At this point the user must have enabled trace mode, + * and mmapped the file. Coverage collection is disabled only + * at task exit or voluntarily by KCOV_DISABLE.
After that it can + * be enabled for another task. + */ + if (kcov->mode != KCOV_MODE_INIT || !kcov->area) + return -EINVAL; + if (kcov->t != NULL) + return -EBUSY; + if (arg == KCOV_TRACE_PC) + kcov->mode = KCOV_MODE_TRACE_PC; + else if (arg == KCOV_TRACE_CMP) +#ifdef CONFIG_KCOV_ENABLE_COMPARISONS + kcov->mode = KCOV_MODE_TRACE_CMP; +#else + return -ENOTSUPP; +#endif + else + return -EINVAL; + t = current; + /* Cache in task struct for performance. */ + t->kcov_size = kcov->size; + t->kcov_area = kcov->area; + /* See comment in check_kcov_mode(). */ + barrier(); + WRITE_ONCE(t->kcov_mode, kcov->mode); + t->kcov = kcov; + kcov->t = t; + /* This is put either in kcov_task_exit() or in KCOV_DISABLE. */ + kcov_get(kcov); + return 0; + case KCOV_DISABLE: + /* Disable coverage for the current task. */ + unused = arg; + if (unused != 0 || current->kcov != kcov) + return -EINVAL; + t = current; + if (WARN_ON(kcov->t != t)) + return -EINVAL; + kcov_task_init(t); + kcov->t = NULL; + kcov->mode = KCOV_MODE_INIT; + kcov_put(kcov); + return 0; + default: + return -ENOTTY; + } +} + +static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) +{ + struct kcov *kcov; + int res; + + kcov = filep->private_data; + spin_lock(&kcov->lock); + res = kcov_ioctl_locked(kcov, cmd, arg); + spin_unlock(&kcov->lock); + return res; +} + +static const struct file_operations kcov_fops = { + .open = kcov_open, + .unlocked_ioctl = kcov_ioctl, + .compat_ioctl = kcov_ioctl, + .mmap = kcov_mmap, + .release = kcov_close, +}; + +static int __init kcov_init(void) +{ + if (!debugfs_create_file("kcov", 0600, NULL, NULL, &kcov_fops)) { + pr_err("failed to create kcov in debugfs\n"); + return -ENOMEM; + } + return 0; +} + +device_initcall(kcov_init); diff --git a/kernel/kthread.c b/kernel/kthread.c index 850b255649a2..d9b0be5c6a5f 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -65,7 +65,7 @@ static inline struct kthread *to_kthread(struct task_struct *k) static struct kthread *to_live_kthread(struct task_struct *k) { struct completion *vfork = ACCESS_ONCE(k->vfork_done); - if (likely(vfork)) + if (likely(vfork) && try_get_task_stack(k)) return __to_kthread(vfork); return NULL; } @@ -427,8 +427,10 @@ void kthread_unpark(struct task_struct *k) { struct kthread *kthread = to_live_kthread(k); - if (kthread) + if (kthread) { __kthread_unpark(k, kthread); + put_task_stack(k); + } } EXPORT_SYMBOL_GPL(kthread_unpark); @@ -457,6 +459,7 @@ int kthread_park(struct task_struct *k) wait_for_completion(&kthread->parked); } } + put_task_stack(k); ret = 0; } return ret; @@ -492,6 +495,7 @@ int kthread_stop(struct task_struct *k) __kthread_unpark(k, kthread); wake_up_process(k); wait_for_completion(&kthread->exited); + put_task_stack(k); } ret = k->exit_code; put_task_struct(k); @@ -604,6 +608,19 @@ repeat: } EXPORT_SYMBOL_GPL(kthread_worker_fn); +/* + * Returns true when the work could not be queued at the moment. + * It happens when it is already pending in a worker list + * or when it is being cancelled. 
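queuing_blocked() is the guard behind both the queue and the cancel paths that follow. For orientation, typical usage with this tree's pre-rename helper names, as a sketch (module-side error handling trimmed; kthread_cancel_work_sync() is the helper added below):

#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/printk.h>

static struct kthread_worker worker;
static struct kthread_work work;
static struct task_struct *worker_thread;

static void work_fn(struct kthread_work *w)
{
	pr_info("example work ran\n");
}

static int example_start(void)
{
	init_kthread_worker(&worker);
	worker_thread = kthread_run(kthread_worker_fn, &worker, "example");
	if (IS_ERR(worker_thread))
		return PTR_ERR(worker_thread);
	init_kthread_work(&work, work_fn);
	queue_kthread_work(&worker, &work);	/* no-op if already pending */
	return 0;
}

static void example_stop(void)
{
	kthread_cancel_work_sync(&work);	/* the helper added below */
	flush_kthread_worker(&worker);
	kthread_stop(worker_thread);
}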
+ */ +static inline bool queuing_blocked(struct kthread_worker *worker, + struct kthread_work *work) +{ + lockdep_assert_held(&worker->lock); + + return !list_empty(&work->node) || work->canceling; +} + /* insert @work before @pos in @worker */ static void insert_kthread_work(struct kthread_worker *worker, struct kthread_work *work, @@ -633,7 +650,7 @@ bool queue_kthread_work(struct kthread_worker *worker, unsigned long flags; spin_lock_irqsave(&worker->lock, flags); - if (list_empty(&work->node)) { + if (!queuing_blocked(worker, work)) { insert_kthread_work(worker, work, &worker->work_list); ret = true; } @@ -694,6 +711,87 @@ retry: } EXPORT_SYMBOL_GPL(flush_kthread_work); +/* + * This function removes the work from the worker queue. Also it makes sure + * that it won't get queued later via the delayed work's timer. + * + * The work might still be in use when this function finishes. See the + * current_work proceed by the worker. + * + * Return: %true if @work was pending and successfully canceled, + * %false if @work was not pending + */ +static bool __kthread_cancel_work(struct kthread_work *work, + unsigned long *flags) +{ + /* + * Try to remove the work from a worker list. It might either + * be from worker->work_list or from worker->delayed_work_list. + */ + if (!list_empty(&work->node)) { + list_del_init(&work->node); + return true; + } + + return false; +} + +static bool __kthread_cancel_work_sync(struct kthread_work *work) +{ + struct kthread_worker *worker = work->worker; + unsigned long flags; + int ret = false; + + if (!worker) + goto out; + + spin_lock_irqsave(&worker->lock, flags); + /* Work must not be used with >1 worker, see kthread_queue_work(). */ + WARN_ON_ONCE(work->worker != worker); + + ret = __kthread_cancel_work(work, &flags); + + if (worker->current_work != work) + goto out_fast; + + /* + * The work is in progress and we need to wait with the lock released. + * In the meantime, block any queuing by setting the canceling counter. + */ + work->canceling++; + spin_unlock_irqrestore(&worker->lock, flags); + flush_kthread_work(work); + spin_lock_irqsave(&worker->lock, flags); + work->canceling--; + +out_fast: + spin_unlock_irqrestore(&worker->lock, flags); +out: + return ret; +} + +/** + * kthread_cancel_work_sync - cancel a kthread work and wait for it to finish + * @work: the kthread work to cancel + * + * Cancel @work and wait for its execution to finish. This function + * can be used even if the work re-queues itself. On return from this + * function, @work is guaranteed to be not pending or executing on any CPU. + * + * kthread_cancel_work_sync(&delayed_work->work) must not be used for + * delayed_work's. Use kthread_cancel_delayed_work_sync() instead. + * + * The caller must ensure that the worker on which @work was last + * queued can't be destroyed before this function returns. + * + * Return: %true if @work was pending, %false otherwise. + */ +bool kthread_cancel_work_sync(struct kthread_work *work) +{ + return __kthread_cancel_work_sync(work); +} +EXPORT_SYMBOL_GPL(kthread_cancel_work_sync); + /** * flush_kthread_worker - flush all current works on a kthread_worker * @worker: worker to flush diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 8e96f6cc2a4a..31322a4275cd 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -1,3 +1,6 @@ +# Any varying coverage in these files is non-deterministic +# and is generally not a function of system call inputs. 
+KCOV_INSTRUMENT := n obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 60ace56618f6..0e2c4911ba61 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -3128,10 +3128,17 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (depth) { hlock = curr->held_locks + depth - 1; if (hlock->class_idx == class_idx && nest_lock) { - if (hlock->references) + if (hlock->references) { + /* + * Check: unsigned int references:12, overflow. + */ + if (DEBUG_LOCKS_WARN_ON(hlock->references == (1 << 12)-1)) + return 0; + hlock->references++; - else + } else { hlock->references = 2; + } return 1; } diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 8ef1919d63b2..d580b7d6ee6d 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -776,6 +776,8 @@ static void lock_torture_cleanup(void) else lock_torture_print_module_parms(cxt.cur_ops, "End of test: SUCCESS"); + kfree(cxt.lwsa); + kfree(cxt.lrsa); torture_cleanup_end(); } @@ -917,6 +919,8 @@ static int __init lock_torture_init(void) GFP_KERNEL); if (reader_tasks == NULL) { VERBOSE_TOROUT_ERRSTRING("reader_tasks: Out of memory"); + kfree(writer_tasks); + writer_tasks = NULL; firsterr = -ENOMEM; goto unwind; } diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c index 3ef3736002d8..9c951fade415 100644 --- a/kernel/locking/mutex-debug.c +++ b/kernel/locking/mutex-debug.c @@ -49,21 +49,21 @@ void debug_mutex_free_waiter(struct mutex_waiter *waiter) } void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, - struct thread_info *ti) + struct task_struct *task) { SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock)); /* Mark the current thread as blocked on the lock: */ - ti->task->blocked_on = waiter; + task->blocked_on = waiter; } void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, - struct thread_info *ti) + struct task_struct *task) { DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list)); - DEBUG_LOCKS_WARN_ON(waiter->task != ti->task); - DEBUG_LOCKS_WARN_ON(ti->task->blocked_on != waiter); - ti->task->blocked_on = NULL; + DEBUG_LOCKS_WARN_ON(waiter->task != task); + DEBUG_LOCKS_WARN_ON(task->blocked_on != waiter); + task->blocked_on = NULL; list_del_init(&waiter->list); waiter->task = NULL; diff --git a/kernel/locking/mutex-debug.h b/kernel/locking/mutex-debug.h index 0799fd3e4cfa..d06ae3bb46c5 100644 --- a/kernel/locking/mutex-debug.h +++ b/kernel/locking/mutex-debug.h @@ -20,9 +20,9 @@ extern void debug_mutex_wake_waiter(struct mutex *lock, extern void debug_mutex_free_waiter(struct mutex_waiter *waiter); extern void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, - struct thread_info *ti); + struct task_struct *task); extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, - struct thread_info *ti); + struct task_struct *task); extern void debug_mutex_unlock(struct mutex *lock); extern void debug_mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key); diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 14b9cca36b05..c61c56f05dfa 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -549,7 +549,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, goto skip_wait; debug_mutex_lock_common(lock, &waiter); - debug_mutex_add_waiter(lock, &waiter, task_thread_info(task)); + debug_mutex_add_waiter(lock, &waiter, 
task); /* add waiting tasks to the end of the waitqueue (FIFO): */ list_add_tail(&waiter.list, &lock->wait_list); @@ -596,7 +596,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, } __set_task_state(task, TASK_RUNNING); - mutex_remove_waiter(lock, &waiter, current_thread_info()); + mutex_remove_waiter(lock, &waiter, task); /* set it to 0 if there are no waiters left: */ if (likely(list_empty(&lock->wait_list))) atomic_set(&lock->count, 0); @@ -617,7 +617,7 @@ skip_wait: return 0; err: - mutex_remove_waiter(lock, &waiter, task_thread_info(task)); + mutex_remove_waiter(lock, &waiter, task); spin_unlock_mutex(&lock->wait_lock, flags); debug_mutex_free_waiter(&waiter); mutex_release(&lock->dep_map, 1, ip); @@ -731,6 +731,7 @@ static inline void __mutex_unlock_common_slowpath(struct mutex *lock, int nested) { unsigned long flags; + WAKE_Q(wake_q); /* * As a performance measurement, release the lock before doing other @@ -758,11 +759,11 @@ __mutex_unlock_common_slowpath(struct mutex *lock, int nested) struct mutex_waiter, list); debug_mutex_wake_waiter(lock, waiter); - - wake_up_process(waiter->task); + wake_q_add(&wake_q, waiter->task); } spin_unlock_mutex(&lock->wait_lock, flags); + wake_up_q(&wake_q); } /* diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h index 5cda397607f2..a68bae5e852a 100644 --- a/kernel/locking/mutex.h +++ b/kernel/locking/mutex.h @@ -13,7 +13,7 @@ do { spin_lock(lock); (void)(flags); } while (0) #define spin_unlock_mutex(lock, flags) \ do { spin_unlock(lock); (void)(flags); } while (0) -#define mutex_remove_waiter(lock, waiter, ti) \ +#define mutex_remove_waiter(lock, waiter, task) \ __list_del((waiter)->list.prev, (waiter)->list.next) #ifdef CONFIG_MUTEX_SPIN_ON_OWNER diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c index 05a37857ab55..0befa20ce96e 100644 --- a/kernel/locking/osq_lock.c +++ b/kernel/locking/osq_lock.c @@ -1,6 +1,7 @@ #include <linux/percpu.h> #include <linux/sched.h> #include <linux/osq_lock.h> +#include <linux/sched/rt.h> /* * An MCS like lock especially tailored for optimistic spinning for sleeping @@ -85,6 +86,7 @@ bool osq_lock(struct optimistic_spin_queue *lock) { struct optimistic_spin_node *node = this_cpu_ptr(&osq_node); struct optimistic_spin_node *prev, *next; + struct task_struct *task = current; int curr = encode_cpu(smp_processor_id()); int old; @@ -104,6 +106,19 @@ bool osq_lock(struct optimistic_spin_queue *lock) prev = decode_cpu(old); node->prev = prev; + + /* + * osq_lock() unqueue + * + * node->prev = prev osq_wait_next() + * WMB MB + * prev->next = node next->prev = prev // unqueue-C + * + * Here 'node->prev' and 'next->prev' are the same variable and we need + * to ensure these stores happen in-order to avoid corrupting the list. + */ + smp_wmb(); + WRITE_ONCE(prev->next, node); /* @@ -118,8 +133,13 @@ bool osq_lock(struct optimistic_spin_queue *lock) while (!READ_ONCE(node->locked)) { /* * If we need to reschedule bail... so we can block. + * If a task spins on owner on a CPU after acquiring + * osq_lock while a RT task spins on another CPU to + * acquire osq_lock, it will starve the owner from + * completing if owner is to be scheduled on the same CPU. + * It will be a live lock. 
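The mutex slowpath conversion above follows the wake_q pattern used elsewhere in this series: collect wakees on an on-stack queue under the wait lock, drop the lock, then issue the wakeups. A hedged sketch of the shape, with foo_lock/foo_waiter as illustrative stand-ins rather than types from this tree:

#include <linux/list.h>
#include <linux/sched.h>
#include <linux/spinlock.h>

struct foo_waiter {
	struct list_head list;
	struct task_struct *task;
};

struct foo_lock {
	spinlock_t wait_lock;
	struct list_head wait_list;
};

static void foo_unlock_slowpath(struct foo_lock *l)
{
	struct foo_waiter *w, *tmp;
	WAKE_Q(wake_q);				/* on-stack wake queue */

	spin_lock(&l->wait_lock);
	list_for_each_entry_safe(w, tmp, &l->wait_list, list)
		wake_q_add(&wake_q, w->task);	/* takes a task reference */
	spin_unlock(&l->wait_lock);

	wake_up_q(&wake_q);	/* wakeups run without the wait lock held */
}

Deferring the wake_up_process() calls to after the unlock shortens the wait-lock hold time and avoids waking a task only to have it immediately block on the same lock.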
*/ - if (need_resched()) + if (need_resched() || rt_task(task)) goto unqueue; cpu_relax_lowlatency(); diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index a4d4de05b2d1..75c950ede9c7 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -511,6 +511,41 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) unsigned long flags; /* + * If a spinner is present, there is a chance that the load of + * rwsem_has_spinner() in rwsem_wake() can be reordered with + * respect to decrement of rwsem count in __up_write() leading + * to wakeup being missed. + * + * spinning writer up_write caller + * --------------- ----------------------- + * [S] osq_unlock() [L] osq + * spin_lock(wait_lock) + * sem->count=0xFFFFFFFF00000001 + * +0xFFFFFFFF00000000 + * count=sem->count + * MB + * sem->count=0xFFFFFFFE00000001 + * -0xFFFFFFFF00000001 + * RMB + * spin_trylock(wait_lock) + * return + * rwsem_try_write_lock(count) + * spin_unlock(wait_lock) + * schedule() + * + * Reordering of atomic_long_sub_return_release() in __up_write() + * and rwsem_has_spinner() in rwsem_wake() can cause missing of + * wakeup in up_write() context. In spinning writer, sem->count + * and local variable count is 0XFFFFFFFE00000001. It would result + * in rwsem_try_write_lock() failing to acquire rwsem and spinning + * writer going to sleep in rwsem_down_write_failed(). + * + * The smp_rmb() here is to make sure that the spinner state is + * consulted after sem->count is updated in up_write context. + */ + smp_rmb(); + + /* * If a spinner is present, it is not necessary to do the wakeup. * Try to do wakeup only if the trylock succeeds to minimize * spinlock contention which may introduce too much delay in the diff --git a/kernel/module.c b/kernel/module.c index ea5ba3e8d472..a0eeedb3e5cd 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2404,7 +2404,7 @@ static char elf_type(const Elf_Sym *sym, const struct load_info *info) } if (sym->st_shndx == SHN_UNDEF) return 'U'; - if (sym->st_shndx == SHN_ABS) + if (sym->st_shndx == SHN_ABS || sym->st_shndx == info->index.pcpu) return 'a'; if (sym->st_shndx >= SHN_LORESERVE) return '?'; @@ -2433,7 +2433,7 @@ static char elf_type(const Elf_Sym *sym, const struct load_info *info) } static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs, - unsigned int shnum) + unsigned int shnum, unsigned int pcpundx) { const Elf_Shdr *sec; @@ -2442,6 +2442,11 @@ static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs, || !src->st_name) return false; +#ifdef CONFIG_KALLSYMS_ALL + if (src->st_shndx == pcpundx) + return true; +#endif + sec = sechdrs + src->st_shndx; if (!(sec->sh_flags & SHF_ALLOC) #ifndef CONFIG_KALLSYMS_ALL @@ -2479,7 +2484,8 @@ static void layout_symtab(struct module *mod, struct load_info *info) /* Compute total space required for the core symbols' strtab. 
*/ for (ndst = i = 0; i < nsrc; i++) { if (i == 0 || - is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) { + is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum, + info->index.pcpu)) { strtab_size += strlen(&info->strtab[src[i].st_name])+1; ndst++; } @@ -2537,7 +2543,8 @@ static void add_kallsyms(struct module *mod, const struct load_info *info) src = mod->kallsyms->symtab; for (ndst = i = 0; i < mod->kallsyms->num_symtab; i++) { if (i == 0 || - is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) { + is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum, + info->index.pcpu)) { dst[ndst] = src[i]; dst[ndst++].st_name = s - mod->core_kallsyms.strtab; s += strlcpy(s, &mod->kallsyms->strtab[src[i].st_name], @@ -2868,6 +2875,15 @@ static struct module *setup_load_info(struct load_info *info, int flags) return mod; } +static void check_modinfo_retpoline(struct module *mod, struct load_info *info) +{ + if (retpoline_module_ok(get_modinfo(info, "retpoline"))) + return; + + pr_warn("%s: loading module not compiled with retpoline compiler.\n", + mod->name); +} + static int check_modinfo(struct module *mod, struct load_info *info, int flags) { const char *modmagic = get_modinfo(info, "vermagic"); @@ -2887,8 +2903,14 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags) return -ENOEXEC; } - if (!get_modinfo(info, "intree")) + if (!get_modinfo(info, "intree")) { + if (!test_taint(TAINT_OOT_MODULE)) + pr_warn("%s: loading out-of-tree module taints kernel.\n", + mod->name); add_taint_module(mod, TAINT_OOT_MODULE, LOCKDEP_STILL_OK); + } + + check_modinfo_retpoline(mod, info); if (get_modinfo(info, "staging")) { add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK); @@ -3053,6 +3075,8 @@ static int move_module(struct module *mod, struct load_info *info) static int check_module_license_and_versions(struct module *mod) { + int prev_taint = test_taint(TAINT_PROPRIETARY_MODULE); + /* * ndiswrapper is under GPL by itself, but loads proprietary modules. * Don't use add_taint_module(), as it would prevent ndiswrapper from @@ -3071,6 +3095,9 @@ static int check_module_license_and_versions(struct module *mod) add_taint_module(mod, TAINT_PROPRIETARY_MODULE, LOCKDEP_NOW_UNRELIABLE); + if (!prev_taint && test_taint(TAINT_PROPRIETARY_MODULE)) + pr_warn("%s: module license taints kernel.\n", mod->name); + #ifdef CONFIG_MODVERSIONS if ((mod->num_syms && !mod->crcs) || (mod->num_gpl_syms && !mod->gpl_crcs) diff --git a/kernel/panic.c b/kernel/panic.c index 982a52352cfc..75f564a94a82 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -24,6 +24,7 @@ #include <linux/init.h> #include <linux/nmi.h> #include <linux/console.h> +#include <soc/qcom/minidump.h> #define CREATE_TRACE_POINTS #include <trace/events/exception.h> @@ -108,6 +109,7 @@ void panic(const char *fmt, ...) va_start(args, fmt); vsnprintf(buf, sizeof(buf), fmt, args); va_end(args); + dump_stack_minidump(0); pr_emerg("Kernel panic - not syncing: %s\n", buf); #ifdef CONFIG_DEBUG_BUGVERBOSE /* @@ -172,7 +174,7 @@ void panic(const char *fmt, ...) * Delay timeout seconds before rebooting the machine. * We can't use the "normal" timers since we just panicked. 
*/ - pr_emerg("Rebooting in %d seconds..", panic_timeout); + pr_emerg("Rebooting in %d seconds..\n", panic_timeout); for (i = 0; i < panic_timeout * 1000; i += PANIC_TIMER_STEP) { touch_nmi_watchdog(); diff --git a/kernel/power/process.c b/kernel/power/process.c index e7f1f736a5b6..cc177142a08f 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -19,8 +19,9 @@ #include <linux/kmod.h> #include <trace/events/power.h> #include <linux/wakeup_reason.h> +#include <linux/cpuset.h> -/* +/* * Timeout for stopping processes */ unsigned int __read_mostly freeze_timeout_msecs = 20 * MSEC_PER_SEC; @@ -208,6 +209,8 @@ void thaw_processes(void) __usermodehelper_set_disable_depth(UMH_FREEZING); thaw_workqueues(); + cpuset_wait_for_hotplug(); + read_lock(&tasklist_lock); for_each_process_thread(g, p) { /* No other threads should have PF_SUSPEND_TASK set */ diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 9fcb521fab0e..dca87791e9c1 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -3180,9 +3180,8 @@ void show_regs_print_info(const char *log_lvl) { dump_stack_print_info(log_lvl); - printk("%stask: %p ti: %p task.ti: %p\n", - log_lvl, current, current_thread_info(), - task_thread_info(current)); + printk("%stask: %p task.stack: %p\n", + log_lvl, current, task_stack_page(current)); } #endif diff --git a/kernel/profile.c b/kernel/profile.c index 99513e1160e5..9cd8e18e6f18 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -44,7 +44,7 @@ int prof_on __read_mostly; EXPORT_SYMBOL_GPL(prof_on); static cpumask_var_t prof_cpu_mask; -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) && defined(CONFIG_PROC_FS) static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits); static DEFINE_PER_CPU(int, cpu_profile_flip); static DEFINE_MUTEX(profile_flip_mutex); @@ -201,7 +201,7 @@ int profile_event_unregister(enum profile_type type, struct notifier_block *n) } EXPORT_SYMBOL_GPL(profile_event_unregister); -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) && defined(CONFIG_PROC_FS) /* * Each cpu has a pair of open-addressed hashtables for pending * profile hits. read_profile() IPI's all cpus to request them diff --git a/kernel/ptrace.c b/kernel/ptrace.c index c7e8ed99c953..5e2cd1030702 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -28,19 +28,25 @@ #include <linux/compat.h> +void __ptrace_link(struct task_struct *child, struct task_struct *new_parent, + const struct cred *ptracer_cred) +{ + BUG_ON(!list_empty(&child->ptrace_entry)); + list_add(&child->ptrace_entry, &new_parent->ptraced); + child->parent = new_parent; + child->ptracer_cred = get_cred(ptracer_cred); +} + /* * ptrace a task: make the debugger its new parent and * move it to the ptrace list. * * Must be called with the tasklist lock write-held. 
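Both attach flavours now funnel through ptrace_link(). The ptrace_traceme() side is the one a debuggee triggers on itself; a standard user-space sketch of that path (plain ptrace(2) usage, not code from this series):

#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	pid_t pid = fork();

	if (pid == 0) {
		/* Child: lands in ptrace_traceme() -> ptrace_link(). */
		ptrace(PTRACE_TRACEME, 0, NULL, NULL);
		execlp("true", "true", (char *)NULL);
		_exit(127);
	}
	waitpid(pid, NULL, 0);			/* child stops at exec */
	ptrace(PTRACE_CONT, pid, NULL, NULL);
	waitpid(pid, NULL, 0);
	return 0;
}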
*/ -void __ptrace_link(struct task_struct *child, struct task_struct *new_parent) +static void ptrace_link(struct task_struct *child, struct task_struct *new_parent) { - BUG_ON(!list_empty(&child->ptrace_entry)); - list_add(&child->ptrace_entry, &new_parent->ptraced); - child->parent = new_parent; rcu_read_lock(); - child->ptracer_cred = get_cred(__task_cred(new_parent)); + __ptrace_link(child, new_parent, __task_cred(new_parent)); rcu_read_unlock(); } @@ -353,7 +359,7 @@ static int ptrace_attach(struct task_struct *task, long request, flags |= PT_SEIZED; task->ptrace = flags; - __ptrace_link(task, current); + ptrace_link(task, current); /* SEIZE doesn't trap tracee on attach */ if (!seize) @@ -420,7 +426,7 @@ static int ptrace_traceme(void) */ if (!ret && !(current->real_parent->flags & PF_EXITING)) { current->ptrace = PT_PTRACED; - __ptrace_link(current, current->real_parent); + ptrace_link(current, current->real_parent); } } write_unlock_irq(&tasklist_lock); diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index 61a16569ffbf..032b2c015beb 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -1,3 +1,7 @@ +# Any varying coverage in these files is non-deterministic +# and is generally not a function of system call inputs. +KCOV_INSTRUMENT := n + obj-y += update.o sync.o obj-$(CONFIG_SRCU) += srcu.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 2cb46d51d715..3decfbc88308 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -248,24 +248,17 @@ static int rcu_gp_in_progress(struct rcu_state *rsp) */ void rcu_sched_qs(void) { - unsigned long flags; - - if (__this_cpu_read(rcu_sched_data.cpu_no_qs.s)) { - trace_rcu_grace_period(TPS("rcu_sched"), - __this_cpu_read(rcu_sched_data.gpnum), - TPS("cpuqs")); - __this_cpu_write(rcu_sched_data.cpu_no_qs.b.norm, false); - if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) - return; - local_irq_save(flags); - if (__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) { - __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, false); - rcu_report_exp_rdp(&rcu_sched_state, - this_cpu_ptr(&rcu_sched_data), - true); - } - local_irq_restore(flags); - } + if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.s)) + return; + trace_rcu_grace_period(TPS("rcu_sched"), + __this_cpu_read(rcu_sched_data.gpnum), + TPS("cpuqs")); + __this_cpu_write(rcu_sched_data.cpu_no_qs.b.norm, false); + if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) + return; + __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, false); + rcu_report_exp_rdp(&rcu_sched_state, + this_cpu_ptr(&rcu_sched_data), true); } void rcu_bh_qs(void) @@ -302,17 +295,16 @@ EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr); * We inform the RCU core by emulating a zero-duration dyntick-idle * period, which we in turn do by incrementing the ->dynticks counter * by two. + * + * The caller must have disabled interrupts. */ static void rcu_momentary_dyntick_idle(void) { - unsigned long flags; struct rcu_data *rdp; struct rcu_dynticks *rdtp; int resched_mask; struct rcu_state *rsp; - local_irq_save(flags); - /* * Yes, we can lose flag-setting operations. This is OK, because * the flag will be set again after some delay. @@ -342,13 +334,12 @@ static void rcu_momentary_dyntick_idle(void) smp_mb__after_atomic(); /* Later stuff after QS. */ break; } - local_irq_restore(flags); } /* * Note a context switch. This is a quiescent state for RCU-sched, * and requires special handling for preemptible RCU. - * The caller must have disabled preemption. 
+ * The caller must have disabled interrupts. */ void rcu_note_context_switch(void) { @@ -378,9 +369,14 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch); */ void rcu_all_qs(void) { + unsigned long flags; + barrier(); /* Avoid RCU read-side critical sections leaking down. */ - if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) + if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) { + local_irq_save(flags); rcu_momentary_dyntick_idle(); + local_irq_restore(flags); + } this_cpu_inc(rcu_qs_ctr); barrier(); /* Avoid RCU read-side critical sections leaking up. */ } @@ -761,6 +757,12 @@ void rcu_irq_exit(void) local_irq_save(flags); rdtp = this_cpu_ptr(&rcu_dynticks); + + /* Page faults can happen in NMI handlers, so check... */ + if (READ_ONCE(rdtp->dynticks_nmi_nesting)) + return; + + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_exit() invoked with irqs enabled!!!"); oldval = rdtp->dynticks_nesting; rdtp->dynticks_nesting--; WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && @@ -889,6 +891,12 @@ void rcu_irq_enter(void) local_irq_save(flags); rdtp = this_cpu_ptr(&rcu_dynticks); + + /* Page faults can happen in NMI handlers, so check... */ + if (READ_ONCE(rdtp->dynticks_nmi_nesting)) + return; + + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_enter() invoked with irqs enabled!!!"); oldval = rdtp->dynticks_nesting; rdtp->dynticks_nesting++; WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 32cbe72bf545..c6fc11d626f8 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -147,8 +147,8 @@ static void __init rcu_bootup_announce(void) * the corresponding expedited grace period will also be the end of the * normal grace period. */ -static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp, - unsigned long flags) __releases(rnp->lock) +static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) + __releases(rnp->lock) /* But leaves rrupts disabled. */ { int blkd_state = (rnp->gp_tasks ? RCU_GP_TASKS : 0) + (rnp->exp_tasks ? RCU_EXP_TASKS : 0) + @@ -236,7 +236,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp, rnp->gp_tasks = &t->rcu_node_entry; if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD)) rnp->exp_tasks = &t->rcu_node_entry; - raw_spin_unlock(&rnp->lock); + raw_spin_unlock(&rnp->lock); /* rrupts remain disabled. */ /* * Report the quiescent state for the expedited GP. This expedited @@ -251,7 +251,6 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp, } else { WARN_ON_ONCE(t->rcu_read_unlock_special.b.exp_need_qs); } - local_irq_restore(flags); } /* @@ -286,12 +285,11 @@ static void rcu_preempt_qs(void) * predating the current grace period drain, in other words, until * rnp->gp_tasks becomes NULL. * - * Caller must disable preemption. + * Caller must disable interrupts. */ static void rcu_preempt_note_context_switch(void) { struct task_struct *t = current; - unsigned long flags; struct rcu_data *rdp; struct rcu_node *rnp; @@ -301,7 +299,7 @@ static void rcu_preempt_note_context_switch(void) /* Possibly blocking in an RCU read-side critical section. */ rdp = this_cpu_ptr(rcu_state_p->rda); rnp = rdp->mynode; - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock(&rnp->lock); smp_mb__after_unlock_lock(); t->rcu_read_unlock_special.b.blocked = true; t->rcu_blocked_node = rnp; @@ -318,7 +316,7 @@ static void rcu_preempt_note_context_switch(void) (rnp->qsmask & rdp->grpmask) ? 
rnp->gpnum : rnp->gpnum + 1); - rcu_preempt_ctxt_queue(rnp, rdp, flags); + rcu_preempt_ctxt_queue(rnp, rdp); } else if (t->rcu_read_lock_nesting < 0 && t->rcu_read_unlock_special.s) { diff --git a/kernel/resource.c b/kernel/resource.c index 4c9835c09dcd..c09d484f7b5f 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -105,16 +105,25 @@ static int r_show(struct seq_file *m, void *v) { struct resource *root = m->private; struct resource *r = v, *p; + unsigned long long start, end; int width = root->end < 0x10000 ? 4 : 8; int depth; for (depth = 0, p = r; depth < MAX_IORES_LEVEL; depth++, p = p->parent) if (p->parent == root) break; + + if (file_ns_capable(m->file, &init_user_ns, CAP_SYS_ADMIN)) { + start = r->start; + end = r->end; + } else { + start = end = 0; + } + seq_printf(m, "%*s%0*llx-%0*llx : %s\n", depth * 2, "", - width, (unsigned long long) r->start, - width, (unsigned long long) r->end, + width, start, + width, end, r->name ? r->name : "<BAD>"); return 0; } diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 308f80ce2e43..7dde1b9918e4 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -2,6 +2,10 @@ ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_clock.o = $(CC_FLAGS_FTRACE) endif +# These files are disabled because they produce non-interesting flaky coverage +# that is not a function of syscall inputs. E.g. involuntary context switches. +KCOV_INSTRUMENT := n + ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is # needed for x86 only. Why this used to be enabled for all architectures is beyond @@ -22,4 +26,5 @@ obj-$(CONFIG_SCHED_DEBUG) += debug.o obj-$(CONFIG_SCHED_TUNE) += tune.o obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o obj-$(CONFIG_SCHED_CORE_CTL) += core_ctl.o -obj-$(CONFIG_CPU_FREQ_GOV_SCHED) += cpufreq_sched.o +obj-$(CONFIG_CPU_FREQ) += cpufreq.o +obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index 750ed601ddf7..8620fd01b3d0 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c @@ -111,14 +111,11 @@ bool task_wants_autogroup(struct task_struct *p, struct task_group *tg) { if (tg != &root_task_group) return false; - /* - * We can only assume the task group can't go away on us if - * autogroup_move_group() can see us on ->thread_group list. + * If we race with autogroup_move_group() the caller can use the old + * value of signal->autogroup but in this case sched_move_task() will + * be called again before autogroup_kref_put(). */ - if (p->flags & PF_EXITING) - return false; - return true; } @@ -138,13 +135,17 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) } p->signal->autogroup = autogroup_kref_get(ag); - - if (!READ_ONCE(sysctl_sched_autogroup_enabled)) - goto out; - + /* + * We can't avoid sched_move_task() after we changed signal->autogroup, + * this process can already run with task_group() == prev->tg or we can + * race with cgroup code which can read autogroup = prev under rq->lock. + * In the latter case for_each_thread() can not miss a migrating thread, + * cpu_cgroup_attach() must not be possible after cgroup_exit() and it + * can't be removed from thread list, we hold ->siglock. 
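With the r_show() change above, the resource tree is still listed for every reader, but the address columns print as zeroes unless the reader has CAP_SYS_ADMIN in the init user namespace. A trivial sketch to observe the difference as root versus an ordinary user:

#include <stdio.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/iomem", "r");

	if (!f)
		return 1;
	/* Unprivileged readers now see "00000000-00000000 : <name>" lines. */
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}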
+ */ for_each_thread(p, t) sched_move_task(t); -out: + unlock_task_sighand(p, &flags); autogroup_kref_put(prev); } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2d2b8834dd3b..03b59c330bdd 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -33,7 +33,7 @@ #include <linux/init.h> #include <linux/uaccess.h> #include <linux/highmem.h> -#include <asm/mmu_context.h> +#include <linux/mmu_context.h> #include <linux/interrupt.h> #include <linux/capability.h> #include <linux/completion.h> @@ -554,6 +554,8 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task) if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL)) return; + head->count++; + get_task_struct(task); /* @@ -563,6 +565,10 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task) head->lastp = &node->next; } +static int +try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags, + int sibling_count_hint); + void wake_up_q(struct wake_q_head *head) { struct wake_q_node *node = head->first; @@ -577,10 +583,10 @@ void wake_up_q(struct wake_q_head *head) task->wake_q.next = NULL; /* - * wake_up_process() implies a wmb() to pair with the queueing + * try_to_wake_up() implies a wmb() to pair with the queueing * in wake_q_add() so as not to miss wakeups. */ - wake_up_process(task); + try_to_wake_up(task, TASK_NORMAL, 0, head->count); put_task_struct(task); } } @@ -621,8 +627,7 @@ void resched_cpu(int cpu) struct rq *rq = cpu_rq(cpu); unsigned long flags; - if (!raw_spin_trylock_irqsave(&rq->lock, flags)) - return; + raw_spin_lock_irqsave(&rq->lock, flags); resched_curr(rq); raw_spin_unlock_irqrestore(&rq->lock, flags); } @@ -1373,7 +1378,9 @@ static void __migrate_swap_task(struct task_struct *p, int cpu) p->on_rq = TASK_ON_RQ_MIGRATING; deactivate_task(src_rq, p, 0); + p->on_rq = TASK_ON_RQ_MIGRATING; set_task_cpu(p, cpu); + p->on_rq = TASK_ON_RQ_QUEUED; activate_task(dst_rq, p, 0); p->on_rq = TASK_ON_RQ_QUEUED; check_preempt_curr(dst_rq, p, 0); @@ -1701,14 +1708,16 @@ out: * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. */ static inline -int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) +int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags, + int sibling_count_hint) { bool allow_isolated = (p->flags & PF_KTHREAD); lockdep_assert_held(&p->pi_lock); if (p->nr_cpus_allowed > 1) - cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); + cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags, + sibling_count_hint); /* * In order not to call set_task_cpu() on a blocking task we need @@ -2006,6 +2015,8 @@ static void ttwu_queue(struct task_struct *p, int cpu) * @p: the thread to be awakened * @state: the mask of task states that can be woken * @wake_flags: wake modifier flags (WF_*) + * @sibling_count_hint: A hint at the number of threads that are being woken up + * in this event. * * Put it on the run-queue if it's not already there. The "current" * thread is always on the run-queue (except when the actual @@ -2017,7 +2028,8 @@ static void ttwu_queue(struct task_struct *p, int cpu) * or @state didn't match @p's state. 
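Condensing the scattered hunks in this file, the new hint flows from the batched wakeup down to CPU selection roughly as follows (a paraphrase of the changed call chain, not a verbatim quote):

/* wake_up_q(): each task learns how many tasks were queued with it */
try_to_wake_up(task, TASK_NORMAL, 0, head->count);

/* try_to_wake_up() forwards the hint to runqueue selection ... */
cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags,
		     sibling_count_hint);

/* ... which hands it to the scheduling class for placement */
cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags,
				     sibling_count_hint);

Single wakeups such as wake_up_process() simply pass a hint of 1.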
*/ static int -try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) +try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags, + int sibling_count_hint) { unsigned long flags; int cpu, src_cpu, success = 0; @@ -2133,7 +2145,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) if (p->sched_class->task_waking) p->sched_class->task_waking(p); - cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); + cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags, + sibling_count_hint); /* Refresh src_cpu as it could have changed since we last read it */ src_cpu = task_cpu(p); @@ -2235,7 +2248,7 @@ out: */ int wake_up_process(struct task_struct *p) { - return try_to_wake_up(p, TASK_NORMAL, 0); + return try_to_wake_up(p, TASK_NORMAL, 0, 1); } EXPORT_SYMBOL(wake_up_process); @@ -2255,13 +2268,13 @@ EXPORT_SYMBOL(wake_up_process); int wake_up_process_no_notif(struct task_struct *p) { WARN_ON(task_is_stopped_or_traced(p)); - return try_to_wake_up(p, TASK_NORMAL, WF_NO_NOTIFIER); + return try_to_wake_up(p, TASK_NORMAL, WF_NO_NOTIFIER, 1); } EXPORT_SYMBOL(wake_up_process_no_notif); int wake_up_state(struct task_struct *p, unsigned int state) { - return try_to_wake_up(p, state, 0); + return try_to_wake_up(p, state, 0, 1); } /* @@ -2276,6 +2289,7 @@ void __dl_clear_params(struct task_struct *p) dl_se->dl_period = 0; dl_se->flags = 0; dl_se->dl_bw = 0; + dl_se->dl_density = 0; dl_se->dl_throttled = 0; dl_se->dl_new = 1; @@ -2312,11 +2326,11 @@ void sched_exit(struct task_struct *p) reset_task_stats(p); p->ravg.mark_start = wallclock; p->ravg.sum_history[0] = EXITING_TASK_MARKER; - free_task_load_ptrs(p); enqueue_task(rq, p, 0); clear_ed_task(p, rq); task_rq_unlock(rq, p, &flags); + free_task_load_ptrs(p); } #endif /* CONFIG_SCHED_HMP */ @@ -2336,9 +2350,16 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->se.prev_sum_exec_runtime = 0; p->se.nr_migrations = 0; p->se.vruntime = 0; +#ifdef CONFIG_SCHED_WALT + p->last_sleep_ts = 0; +#endif INIT_LIST_HEAD(&p->se.group_node); +#ifdef CONFIG_FAIR_GROUP_SCHED + p->se.cfs_rq = NULL; +#endif + #ifdef CONFIG_SCHEDSTATS memset(&p->se.statistics, 0, sizeof(p->se.statistics)); #endif @@ -2347,6 +2368,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) init_dl_task_timer(&p->dl); __dl_clear_params(p); + init_rt_schedtune_timer(&p->rt); INIT_LIST_HEAD(&p->rt.run_list); p->rt.timeout = 0; p->rt.time_slice = sched_rr_timeslice; @@ -2428,11 +2450,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) __sched_fork(clone_flags, p); /* - * We mark the process as running here. This guarantees that + * We mark the process as NEW here. This guarantees that * nobody will actually run it, and a signal or other external * event cannot wake it up and insert it on the runqueue either. */ - p->state = TASK_RUNNING; + p->state = TASK_NEW; /* * Make sure we do not leak PI boosting priority to the child. @@ -2469,8 +2491,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) p->sched_class = &fair_sched_class; } - if (p->sched_class->task_fork) - p->sched_class->task_fork(p); + init_entity_runnable_average(&p->se); /* * The child is not yet in the pid-hash so no cgroup attach races, @@ -2480,7 +2501,13 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) * Silence PROVE_RCU. 
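The TASK_NEW change gives a forked task an explicit cannot-be-woken state until wake_up_new_task(). Pulling the relevant lines from the two functions together (a condensed sketch, not a literal excerpt):

/* sched_fork(), with the new task still invisible to the system: */
p->state = TASK_NEW;		/* signals and wakeups can't touch it */
__set_task_cpu(p, cpu);		/* first placement, not a migration */

/* wake_up_new_task(), the only transition out of TASK_NEW: */
p->state = TASK_RUNNING;
__set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0, 1));
activate_task(rq, p, ENQUEUE_WAKEUP_NEW);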
*/ raw_spin_lock_irqsave(&p->pi_lock, flags); - set_task_cpu(p, cpu); + /* + * We're setting the cpu for the first time, we don't migrate, + * so use __set_task_cpu(). + */ + __set_task_cpu(p, cpu); + if (p->sched_class->task_fork) + p->sched_class->task_fork(p); raw_spin_unlock_irqrestore(&p->pi_lock, flags); #ifdef CONFIG_SCHED_INFO @@ -2613,6 +2640,8 @@ void wake_up_new_task(struct task_struct *p) add_new_task_to_grp(p); raw_spin_lock_irqsave(&p->pi_lock, flags); + p->state = TASK_RUNNING; + /* Initialize new task's runnable average */ init_entity_runnable_average(&p->se); #ifdef CONFIG_SMP @@ -2620,12 +2649,16 @@ void wake_up_new_task(struct task_struct *p) * Fork balancing, do it here and not earlier because: * - cpus_allowed can change in the fork path * - any previously selected cpu might disappear through hotplug + * + * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, + * as we're not fully set-up yet. */ - set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); + __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0, 1)); #endif - rq = __task_rq_lock(p); mark_task_starting(p); + update_rq_clock(rq); + post_init_entity_util_avg(&p->se); activate_task(rq, p, ENQUEUE_WAKEUP_NEW); p->on_rq = TASK_ON_RQ_QUEUED; trace_sched_wakeup_new(p); @@ -2931,7 +2964,7 @@ context_switch(struct rq *rq, struct task_struct *prev, atomic_inc(&oldmm->mm_count); enter_lazy_tlb(oldmm, next); } else - switch_mm(oldmm, mm, next); + switch_mm_irqs_off(oldmm, mm, next); if (!prev->mm) { prev->active_mm = NULL; @@ -3070,7 +3103,7 @@ void sched_exec(void) raw_spin_lock_irqsave(&p->pi_lock, flags); curr_cpu = task_cpu(p); - dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0); + dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0, 1); if (dest_cpu == smp_processor_id()) goto unlock; @@ -3136,93 +3169,6 @@ unsigned long long task_sched_runtime(struct task_struct *p) return ns; } -#ifdef CONFIG_CPU_FREQ_GOV_SCHED - -static inline -unsigned long add_capacity_margin(unsigned long cpu_capacity) -{ - cpu_capacity = cpu_capacity * capacity_margin; - cpu_capacity /= SCHED_CAPACITY_SCALE; - return cpu_capacity; -} - -static inline -unsigned long sum_capacity_reqs(unsigned long cfs_cap, - struct sched_capacity_reqs *scr) -{ - unsigned long total = add_capacity_margin(cfs_cap + scr->rt); - return total += scr->dl; -} - -static void sched_freq_tick_pelt(int cpu) -{ - unsigned long cpu_utilization = capacity_max; - unsigned long capacity_curr = capacity_curr_of(cpu); - struct sched_capacity_reqs *scr; - - scr = &per_cpu(cpu_sched_capacity_reqs, cpu); - if (sum_capacity_reqs(cpu_utilization, scr) < capacity_curr) - return; - - /* - * To make free room for a task that is building up its "real" - * utilization and to harm its performance the least, request - * a jump to a higher OPP as soon as the margin of free capacity - * is impacted (specified by capacity_margin). - */ - set_cfs_cpu_capacity(cpu, true, cpu_utilization); -} - -#ifdef CONFIG_SCHED_WALT -static void sched_freq_tick_walt(int cpu) -{ - unsigned long cpu_utilization = cpu_util(cpu); - unsigned long capacity_curr = capacity_curr_of(cpu); - - if (walt_disabled || !sysctl_sched_use_walt_cpu_util) - return sched_freq_tick_pelt(cpu); - - /* - * Add a margin to the WALT utilization. 
- * NOTE: WALT tracks a single CPU signal for all the scheduling - * classes, thus this margin is going to be added to the DL class as - * well, which is something we do not do in sched_freq_tick_pelt case. - */ - cpu_utilization = add_capacity_margin(cpu_utilization); - if (cpu_utilization <= capacity_curr) - return; - - /* - * It is likely that the load is growing so we - * keep the added margin in our request as an - * extra boost. - */ - set_cfs_cpu_capacity(cpu, true, cpu_utilization); - -} -#define _sched_freq_tick(cpu) sched_freq_tick_walt(cpu) -#else -#define _sched_freq_tick(cpu) sched_freq_tick_pelt(cpu) -#endif /* CONFIG_SCHED_WALT */ - -static void sched_freq_tick(int cpu) -{ - unsigned long capacity_orig, capacity_curr; - - if (!sched_freq()) - return; - - capacity_orig = capacity_orig_of(cpu); - capacity_curr = capacity_curr_of(cpu); - if (capacity_curr == capacity_orig) - return; - - _sched_freq_tick(cpu); -} -#else -static inline void sched_freq_tick(int cpu) { } -#endif /* CONFIG_CPU_FREQ_GOV_SCHED */ - /* * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. @@ -3249,7 +3195,6 @@ void scheduler_tick(void) wallclock = sched_ktime_clock(); update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); early_notif = early_detection_notify(rq, wallclock); - sched_freq_tick(cpu); raw_spin_unlock(&rq->lock); if (early_notif) @@ -3511,7 +3456,6 @@ static void __sched notrace __schedule(bool preempt) cpu = smp_processor_id(); rq = cpu_rq(cpu); - rcu_note_context_switch(); prev = rq->curr; /* @@ -3530,13 +3474,16 @@ static void __sched notrace __schedule(bool preempt) if (sched_feat(HRTICK)) hrtick_clear(rq); + local_irq_disable(); + rcu_note_context_switch(); + /* * Make sure that signal_pending_state()->signal_pending() below * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) * done by the caller to avoid the race with signal_wake_up(). */ smp_mb__before_spinlock(); - raw_spin_lock_irq(&rq->lock); + raw_spin_lock(&rq->lock); lockdep_pin_lock(&rq->lock); rq->clock_skip_update <<= 1; /* promote REQ to ACT */ @@ -3582,6 +3529,10 @@ static void __sched notrace __schedule(bool preempt) if (!is_idle_task(prev) && !prev->on_rq) update_avg_burst(prev); +#ifdef CONFIG_SCHED_WALT + if (!prev->on_rq) + prev->last_sleep_ts = wallclock; +#endif rq->nr_switches++; rq->curr = next; ++*switch_count; @@ -3758,7 +3709,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void) int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, void *key) { - return try_to_wake_up(curr->private, mode, wake_flags); + return try_to_wake_up(curr->private, mode, wake_flags, 1); } EXPORT_SYMBOL(default_wake_function); @@ -3784,6 +3735,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) BUG_ON(prio > MAX_PRIO); rq = __task_rq_lock(p); + update_rq_clock(rq); /* * Idle task boosting is a nono in general. There is one @@ -3879,6 +3831,8 @@ void set_user_nice(struct task_struct *p, long nice) * the task might be in the middle of scheduling on another CPU. 
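This hunk, like the rt_mutex_setprio() and sched_setscheduler() hunks nearby, standardizes on one guard before touching a task's parameters. The recurring shape, sketched as a stand-alone helper:

static void example_change_param(struct task_struct *p)
{
	unsigned long flags;
	struct rq *rq;

	rq = task_rq_lock(p, &flags);	/* pin p's runqueue, irqs off */
	update_rq_clock(rq);		/* refresh rq->clock so dequeue/
					 * enqueue accounting doesn't use
					 * a stale timestamp */
	/* ... adjust priority / nice / policy here ... */
	task_rq_unlock(rq, p, &flags);
}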
*/ rq = task_rq_lock(p, &flags); + update_rq_clock(rq); + /* * The RT priorities are set via sched_setscheduler(), but we still * allow the 'normal' nice value to be set - but as expected @@ -4039,6 +3993,7 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr) dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline; dl_se->flags = attr->sched_flags; dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); + dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime); /* * Changing the parameters of a task is 'tricky' and we're not doing @@ -4306,6 +4261,7 @@ recheck: * runqueue lock must be held. */ rq = task_rq_lock(p, &flags); + update_rq_clock(rq); /* * Changing the policy of the stop threads its a very bad idea @@ -4961,6 +4917,15 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) raw_spin_lock_irqsave(&p->pi_lock, flags); cpumask_and(mask, &p->cpus_allowed, cpu_active_mask); + + /* + * The userspace tasks are forbidden to run on + * isolated CPUs. So exclude isolated CPUs from + * the getaffinity. + */ + if (!(p->flags & PF_KTHREAD)) + cpumask_andnot(mask, mask, cpu_isolated_mask); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); out_unlock: @@ -6460,9 +6425,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, if (!(sd->flags & SD_LOAD_BALANCE)) { printk("does not load-balance\n"); - if (sd->parent) - printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" - " has parent"); return -1; } @@ -6555,8 +6517,12 @@ static inline bool sched_debug(void) static int sd_degenerate(struct sched_domain *sd) { - if (cpumask_weight(sched_domain_span(sd)) == 1) - return 1; + if (cpumask_weight(sched_domain_span(sd)) == 1) { + if (sd->groups->sge) + sd->flags &= ~SD_LOAD_BALANCE; + else + return 1; + } /* Following flags need at least 2 groups */ if (sd->flags & (SD_LOAD_BALANCE | @@ -6564,6 +6530,7 @@ static int sd_degenerate(struct sched_domain *sd) SD_BALANCE_FORK | SD_BALANCE_EXEC | SD_SHARE_CPUCAPACITY | + SD_ASYM_CPUCAPACITY | SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN | SD_SHARE_CAP_STATES)) { @@ -6595,11 +6562,16 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) SD_BALANCE_NEWIDLE | SD_BALANCE_FORK | SD_BALANCE_EXEC | + SD_ASYM_CPUCAPACITY | SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES | SD_PREFER_SIBLING | SD_SHARE_POWERDOMAIN | SD_SHARE_CAP_STATES); + if (parent->groups->sge) { + parent->flags &= ~SD_LOAD_BALANCE; + return 0; + } if (nr_node_ids == 1) pflags &= ~SD_SERIALIZE; } @@ -6659,6 +6631,19 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) call_rcu_sched(&old_rd->rcu, free_rootdomain); } +void sched_get_rd(struct root_domain *rd) +{ + atomic_inc(&rd->refcount); +} + +void sched_put_rd(struct root_domain *rd) +{ + if (!atomic_dec_and_test(&rd->refcount)) + return; + + call_rcu_sched(&rd->rcu, free_rootdomain); +} + static int init_rootdomain(struct root_domain *rd) { memset(rd, 0, sizeof(*rd)); @@ -6672,6 +6657,12 @@ static int init_rootdomain(struct root_domain *rd) if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) goto free_dlo_mask; +#ifdef HAVE_RT_PUSH_IPI + rd->rto_cpu = -1; + raw_spin_lock_init(&rd->rto_lock); + init_irq_work(&rd->rto_push_work, rto_push_irq_work_func); +#endif + init_dl_bw(&rd->dl_bw); if (cpudl_init(&rd->cpudl) != 0) goto free_dlo_mask; @@ -6680,6 +6671,9 @@ static int init_rootdomain(struct root_domain *rd) goto free_rto_mask; init_max_cpu_capacity(&rd->max_cpu_capacity); + + rd->max_cap_orig_cpu = rd->min_cap_orig_cpu = -1; + return 0; 
free_rto_mask: @@ -6913,6 +6907,9 @@ enum s_alloc { * Build an iteration mask that can exclude certain CPUs from the upwards * domain traversal. * + * Only CPUs that can arrive at this group should be considered to continue + * balancing. + * * Asymmetric node setups can result in situations where the domain tree is of * unequal depth, make sure to skip domains that already cover the entire * range. @@ -6924,18 +6921,31 @@ enum s_alloc { */ static void build_group_mask(struct sched_domain *sd, struct sched_group *sg) { - const struct cpumask *span = sched_domain_span(sd); + const struct cpumask *sg_span = sched_group_cpus(sg); struct sd_data *sdd = sd->private; struct sched_domain *sibling; int i; - for_each_cpu(i, span) { + for_each_cpu(i, sg_span) { sibling = *per_cpu_ptr(sdd->sd, i); - if (!cpumask_test_cpu(i, sched_domain_span(sibling))) + + /* + * Can happen in the asymmetric case, where these siblings are + * unused. The mask will not be empty because those CPUs that + * do have the top domain _should_ span the domain. + */ + if (!sibling->child) + continue; + + /* If we would not end up here, we can't continue from here */ + if (!cpumask_equal(sg_span, sched_domain_span(sibling->child))) continue; cpumask_set_cpu(i, sched_group_mask(sg)); } + + /* We must not have empty masks here */ + WARN_ON_ONCE(cpumask_empty(sched_group_mask(sg))); } /* @@ -6996,6 +7006,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) */ sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); sg->sgc->max_capacity = SCHED_CAPACITY_SCALE; + sg->sgc->min_capacity = SCHED_CAPACITY_SCALE; /* * Make sure the first group of this domain contains the @@ -7291,11 +7302,19 @@ static int sched_domains_curr_level; /* * SD_flags allowed in topology descriptions. * - * SD_SHARE_CPUCAPACITY - describes SMT topologies - * SD_SHARE_PKG_RESOURCES - describes shared caches - * SD_NUMA - describes NUMA topologies - * SD_SHARE_POWERDOMAIN - describes shared power domain - * SD_SHARE_CAP_STATES - describes shared capacity states + * These flags are purely descriptive of the topology and do not prescribe + * behaviour. Behaviour is artificial and mapped in the below sd_init() + * function: + * + * SD_SHARE_CPUCAPACITY - describes SMT topologies + * SD_SHARE_PKG_RESOURCES - describes shared caches + * SD_NUMA - describes NUMA topologies + * SD_SHARE_POWERDOMAIN - describes shared power domain + * SD_ASYM_CPUCAPACITY - describes mixed capacity topologies + * SD_SHARE_CAP_STATES - describes shared capacity states + * + * Odd one out, which beside describing the topology has a quirk also + * prescribes the desired behaviour that goes along with it: * * Odd one out: * SD_ASYM_PACKING - describes SMT quirks @@ -7305,11 +7324,13 @@ static int sched_domains_curr_level; SD_SHARE_PKG_RESOURCES | \ SD_NUMA | \ SD_ASYM_PACKING | \ + SD_ASYM_CPUCAPACITY | \ SD_SHARE_POWERDOMAIN | \ SD_SHARE_CAP_STATES) static struct sched_domain * -sd_init(struct sched_domain_topology_level *tl, int cpu) +sd_init(struct sched_domain_topology_level *tl, + struct sched_domain *child, int cpu) { struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); int sd_weight, sd_flags = 0; @@ -7361,6 +7382,7 @@ sd_init(struct sched_domain_topology_level *tl, int cpu) .smt_gain = 0, .max_newidle_lb_cost = 0, .next_decay_max_lb_cost = jiffies, + .child = child, #ifdef CONFIG_SCHED_DEBUG .name = tl->name, #endif @@ -7370,6 +7392,13 @@ sd_init(struct sched_domain_topology_level *tl, int cpu) * Convert topological properties into behaviour. 
*/ + if (sd->flags & SD_ASYM_CPUCAPACITY) { + struct sched_domain *t = sd; + + for_each_lower_domain(t) + t->flags |= SD_BALANCE_WAKE; + } + if (sd->flags & SD_SHARE_CPUCAPACITY) { sd->flags |= SD_PREFER_SIBLING; sd->imbalance_pct = 110; @@ -7816,16 +7845,13 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, const struct cpumask *cpu_map, struct sched_domain_attr *attr, struct sched_domain *child, int cpu) { - struct sched_domain *sd = sd_init(tl, cpu); - if (!sd) - return child; + struct sched_domain *sd = sd_init(tl, child, cpu); cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); if (child) { sd->level = child->level + 1; sched_domain_level_max = max(sched_domain_level_max, sd->level); child->parent = sd; - sd->child = child; if (!cpumask_subset(sched_domain_span(child), sched_domain_span(sd))) { @@ -7859,7 +7885,6 @@ static int build_sched_domains(const struct cpumask *cpu_map, enum s_alloc alloc_state; struct sched_domain *sd; struct s_data d; - struct rq *rq = NULL; int i, ret = -ENOMEM; alloc_state = __visit_domain_allocation_hell(&d, cpu_map); @@ -7877,8 +7902,6 @@ static int build_sched_domains(const struct cpumask *cpu_map, *per_cpu_ptr(d.sd, i) = sd; if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) sd->flags |= SD_OVERLAP; - if (cpumask_equal(cpu_map, sched_domain_span(sd))) - break; } } @@ -7914,8 +7937,19 @@ static int build_sched_domains(const struct cpumask *cpu_map, /* Attach the domains */ rcu_read_lock(); for_each_cpu(i, cpu_map) { - rq = cpu_rq(i); + int max_cpu = READ_ONCE(d.rd->max_cap_orig_cpu); + int min_cpu = READ_ONCE(d.rd->min_cap_orig_cpu); + + if ((max_cpu < 0) || (cpu_rq(i)->cpu_capacity_orig > + cpu_rq(max_cpu)->cpu_capacity_orig)) + WRITE_ONCE(d.rd->max_cap_orig_cpu, i); + + if ((min_cpu < 0) || (cpu_rq(i)->cpu_capacity_orig < + cpu_rq(min_cpu)->cpu_capacity_orig)) + WRITE_ONCE(d.rd->min_cap_orig_cpu, i); + sd = *per_cpu_ptr(d.sd, i); + cpu_attach_domain(sd, d.rd, i); } rcu_read_unlock(); @@ -8136,17 +8170,16 @@ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, * operation in the resume sequence, just build a single sched * domain, ignoring cpusets. */ - num_cpus_frozen--; - if (likely(num_cpus_frozen)) { - partition_sched_domains(1, NULL, NULL); + partition_sched_domains(1, NULL, NULL); + if (--num_cpus_frozen) break; - } /* * This is the last CPU online operation. So fall through and * restore the original sched domains by considering the * cpuset configurations. */ + cpuset_force_rebuild(); case CPU_ONLINE: cpuset_update_active_cpus(true); @@ -8339,6 +8372,7 @@ void __init sched_init(void) #ifdef CONFIG_FAIR_GROUP_SCHED root_task_group.shares = ROOT_TASK_GROUP_LOAD; INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); + rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; /* * How much cpu bandwidth does root_task_group get? * @@ -8737,27 +8771,9 @@ void sched_offline_group(struct task_group *tg) spin_unlock_irqrestore(&task_group_lock, flags); } -/* change task's runqueue when it moves between groups. - * The caller of this function should have put the task in its new group - * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to - * reflect its new group. 
- */ -void sched_move_task(struct task_struct *tsk) +static void sched_change_group(struct task_struct *tsk, int type) { struct task_group *tg; - int queued, running; - unsigned long flags; - struct rq *rq; - - rq = task_rq_lock(tsk, &flags); - - running = task_current(rq, tsk); - queued = task_on_rq_queued(tsk); - - if (queued) - dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE); - if (unlikely(running)) - put_prev_task(rq, tsk); /* * All callers are synchronized by task_rq_lock(); we do not use RCU @@ -8770,11 +8786,37 @@ void sched_move_task(struct task_struct *tsk) tsk->sched_task_group = tg; #ifdef CONFIG_FAIR_GROUP_SCHED - if (tsk->sched_class->task_move_group) - tsk->sched_class->task_move_group(tsk); + if (tsk->sched_class->task_change_group) + tsk->sched_class->task_change_group(tsk, type); else #endif set_task_rq(tsk, task_cpu(tsk)); +} + +/* + * Change task's runqueue when it moves between groups. + * + * The caller of this function should have put the task in its new group by + * now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect + * its new group. + */ +void sched_move_task(struct task_struct *tsk) +{ + int queued, running; + unsigned long flags; + struct rq *rq; + + rq = task_rq_lock(tsk, &flags); + + running = task_current(rq, tsk); + queued = task_on_rq_queued(tsk); + + if (queued) + dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE); + if (unlikely(running)) + put_prev_task(rq, tsk); + + sched_change_group(tsk, TASK_MOVE_GROUP); if (unlikely(running)) tsk->sched_class->set_curr_task(rq); @@ -9180,11 +9222,20 @@ cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) if (IS_ERR(tg)) return ERR_PTR(-ENOMEM); - sched_online_group(tg, parent); - return &tg->css; } +/* Expose task group only after completing cgroup initialization */ +static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) +{ + struct task_group *tg = css_tg(css); + struct task_group *parent = css_tg(css->parent); + + if (parent) + sched_online_group(tg, parent); + return 0; +} + static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) { struct task_group *tg = css_tg(css); @@ -9202,15 +9253,28 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) sched_free_group(tg); } +/* + * This is called before wake_up_new_task(), therefore we really only + * have to set its group bits, all the other stuff does not apply. + */ static void cpu_cgroup_fork(struct task_struct *task, void *private) { - sched_move_task(task); + unsigned long flags; + struct rq *rq; + + rq = task_rq_lock(task, &flags); + + update_rq_clock(rq); + sched_change_group(task, TASK_SET_GROUP); + + task_rq_unlock(rq, task, &flags); } static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) { struct task_struct *task; struct cgroup_subsys_state *css; + int ret = 0; cgroup_taskset_for_each(task, css, tset) { #ifdef CONFIG_RT_GROUP_SCHED @@ -9221,8 +9285,24 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) if (task->sched_class != &fair_sched_class) return -EINVAL; #endif + /* + * Serialize against wake_up_new_task() such that if its + * running, we're sure to observe its full state. + */ + raw_spin_lock_irq(&task->pi_lock); + /* + * Avoid calling sched_move_task() before wake_up_new_task() + * has happened. This would lead to problems with PELT, due to + * move wanting to detach+attach while we're not attached yet. 
+ */ + if (task->state == TASK_NEW) + ret = -EINVAL; + raw_spin_unlock_irq(&task->pi_lock); + + if (ret) + break; } - return 0; + return ret; } static void cpu_cgroup_attach(struct cgroup_taskset *tset) @@ -9566,6 +9646,7 @@ static struct cftype cpu_files[] = { struct cgroup_subsys cpu_cgrp_subsys = { .css_alloc = cpu_cgroup_css_alloc, + .css_online = cpu_cgroup_css_online, .css_released = cpu_cgroup_css_released, .css_free = cpu_cgroup_css_free, .fork = cpu_cgroup_fork, diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c new file mode 100644 index 000000000000..dbc51442ecbc --- /dev/null +++ b/kernel/sched/cpufreq.c @@ -0,0 +1,63 @@ +/* + * Scheduler code and data structures related to cpufreq. + * + * Copyright (C) 2016, Intel Corporation + * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include "sched.h" + +DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); + +/** + * cpufreq_add_update_util_hook - Populate the CPU's update_util_data pointer. + * @cpu: The CPU to set the pointer for. + * @data: New pointer value. + * @func: Callback function to set for the CPU. + * + * Set and publish the update_util_data pointer for the given CPU. + * + * The update_util_data pointer of @cpu is set to @data and the callback + * function pointer in the target struct update_util_data is set to @func. + * That function will be called by cpufreq_update_util() from RCU-sched + * read-side critical sections, so it must not sleep. @data will always be + * passed to it as the first argument which allows the function to get to the + * target update_util_data structure and its container. + * + * The update_util_data pointer of @cpu must be NULL when this function is + * called or it will WARN() and return with no effect. + */ +void cpufreq_add_update_util_hook(int cpu, struct update_util_data *data, + void (*func)(struct update_util_data *data, u64 time, + unsigned int flags)) +{ + if (WARN_ON(!data || !func)) + return; + + if (WARN_ON(per_cpu(cpufreq_update_util_data, cpu))) + return; + + data->func = func; + rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), data); +} +EXPORT_SYMBOL_GPL(cpufreq_add_update_util_hook); + +/** + * cpufreq_remove_update_util_hook - Clear the CPU's update_util_data pointer. + * @cpu: The CPU to clear the pointer for. + * + * Clear the update_util_data pointer for the given CPU. + * + * Callers must use RCU-sched callbacks to free any memory that might be + * accessed via the old update_util_data pointer or invoke synchronize_sched() + * right after this function to avoid use-after-free. + */ +void cpufreq_remove_update_util_hook(int cpu) +{ + rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), NULL); +} +EXPORT_SYMBOL_GPL(cpufreq_remove_update_util_hook); diff --git a/kernel/sched/cpufreq_sched.c b/kernel/sched/cpufreq_sched.c deleted file mode 100644 index d751bc2d0d6e..000000000000 --- a/kernel/sched/cpufreq_sched.c +++ /dev/null @@ -1,499 +0,0 @@ -/* - * Copyright (C) 2015 Michael Turquette <mturquette@linaro.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- */ - -#include <linux/cpufreq.h> -#include <linux/module.h> -#include <linux/kthread.h> -#include <linux/percpu.h> -#include <linux/irq_work.h> -#include <linux/delay.h> -#include <linux/string.h> - -#define CREATE_TRACE_POINTS -#include <trace/events/cpufreq_sched.h> - -#include "sched.h" - -#define THROTTLE_DOWN_NSEC 50000000 /* 50ms default */ -#define THROTTLE_UP_NSEC 500000 /* 500us default */ - -struct static_key __read_mostly __sched_freq = STATIC_KEY_INIT_FALSE; -static bool __read_mostly cpufreq_driver_slow; - -#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED -static struct cpufreq_governor cpufreq_gov_sched; -#endif - -static DEFINE_PER_CPU(unsigned long, enabled); -DEFINE_PER_CPU(struct sched_capacity_reqs, cpu_sched_capacity_reqs); - -/** - * gov_data - per-policy data internal to the governor - * @up_throttle: next throttling period expiry if increasing OPP - * @down_throttle: next throttling period expiry if decreasing OPP - * @up_throttle_nsec: throttle period length in nanoseconds if increasing OPP - * @down_throttle_nsec: throttle period length in nanoseconds if decreasing OPP - * @task: worker thread for dvfs transition that may block/sleep - * @irq_work: callback used to wake up worker thread - * @requested_freq: last frequency requested by the sched governor - * - * struct gov_data is the per-policy cpufreq_sched-specific data structure. A - * per-policy instance of it is created when the cpufreq_sched governor receives - * the CPUFREQ_GOV_START condition and a pointer to it exists in the gov_data - * member of struct cpufreq_policy. - * - * Readers of this data must call down_read(policy->rwsem). Writers must - * call down_write(policy->rwsem). - */ -struct gov_data { - ktime_t up_throttle; - ktime_t down_throttle; - unsigned int up_throttle_nsec; - unsigned int down_throttle_nsec; - struct task_struct *task; - struct irq_work irq_work; - unsigned int requested_freq; -}; - -static void cpufreq_sched_try_driver_target(struct cpufreq_policy *policy, - unsigned int freq) -{ - struct gov_data *gd = policy->governor_data; - - /* avoid race with cpufreq_sched_stop */ - if (!down_write_trylock(&policy->rwsem)) - return; - - __cpufreq_driver_target(policy, freq, CPUFREQ_RELATION_L); - - gd->up_throttle = ktime_add_ns(ktime_get(), gd->up_throttle_nsec); - gd->down_throttle = ktime_add_ns(ktime_get(), gd->down_throttle_nsec); - up_write(&policy->rwsem); -} - -static bool finish_last_request(struct gov_data *gd, unsigned int cur_freq) -{ - ktime_t now = ktime_get(); - - ktime_t throttle = gd->requested_freq < cur_freq ? - gd->down_throttle : gd->up_throttle; - - if (ktime_after(now, throttle)) - return false; - - while (1) { - int usec_left = ktime_to_ns(ktime_sub(throttle, now)); - - usec_left /= NSEC_PER_USEC; - trace_cpufreq_sched_throttled(usec_left); - usleep_range(usec_left, usec_left + 100); - now = ktime_get(); - if (ktime_after(now, throttle)) - return true; - } -} - -/* - * we pass in struct cpufreq_policy. 
This is safe because changing out the - * policy requires a call to __cpufreq_governor(policy, CPUFREQ_GOV_STOP), - * which tears down all of the data structures and __cpufreq_governor(policy, - * CPUFREQ_GOV_START) will do a full rebuild, including this kthread with the - * new policy pointer - */ -static int cpufreq_sched_thread(void *data) -{ - struct sched_param param; - struct cpufreq_policy *policy; - struct gov_data *gd; - unsigned int new_request = 0; - unsigned int last_request = 0; - int ret; - - policy = (struct cpufreq_policy *) data; - gd = policy->governor_data; - - param.sched_priority = 50; - ret = sched_setscheduler_nocheck(gd->task, SCHED_FIFO, ¶m); - if (ret) { - pr_warn("%s: failed to set SCHED_FIFO\n", __func__); - do_exit(-EINVAL); - } else { - pr_debug("%s: kthread (%d) set to SCHED_FIFO\n", - __func__, gd->task->pid); - } - - do { - new_request = gd->requested_freq; - if (new_request == last_request) { - set_current_state(TASK_INTERRUPTIBLE); - if (kthread_should_stop()) - break; - schedule(); - } else { - /* - * if the frequency thread sleeps while waiting to be - * unthrottled, start over to check for a newer request - */ - if (finish_last_request(gd, policy->cur)) - continue; - last_request = new_request; - cpufreq_sched_try_driver_target(policy, new_request); - } - } while (!kthread_should_stop()); - - return 0; -} - -static void cpufreq_sched_irq_work(struct irq_work *irq_work) -{ - struct gov_data *gd; - - gd = container_of(irq_work, struct gov_data, irq_work); - if (!gd) - return; - - wake_up_process(gd->task); -} - -static void update_fdomain_capacity_request(int cpu) -{ - unsigned int freq_new, index_new, cpu_tmp; - struct cpufreq_policy *policy; - struct gov_data *gd; - unsigned long capacity = 0; - - /* - * Avoid grabbing the policy if possible. A test is still - * required after locking the CPU's policy to avoid racing - * with the governor changing. - */ - if (!per_cpu(enabled, cpu)) - return; - - policy = cpufreq_cpu_get(cpu); - if (IS_ERR_OR_NULL(policy)) - return; - - if (policy->governor != &cpufreq_gov_sched || - !policy->governor_data) - goto out; - - gd = policy->governor_data; - - /* find max capacity requested by cpus in this policy */ - for_each_cpu(cpu_tmp, policy->cpus) { - struct sched_capacity_reqs *scr; - - scr = &per_cpu(cpu_sched_capacity_reqs, cpu_tmp); - capacity = max(capacity, scr->total); - } - - /* Convert the new maximum capacity request into a cpu frequency */ - freq_new = capacity * policy->max >> SCHED_CAPACITY_SHIFT; - if (cpufreq_frequency_table_target(policy, policy->freq_table, - freq_new, CPUFREQ_RELATION_L, - &index_new)) - goto out; - freq_new = policy->freq_table[index_new].frequency; - - if (freq_new > policy->max) - freq_new = policy->max; - - if (freq_new < policy->min) - freq_new = policy->min; - - trace_cpufreq_sched_request_opp(cpu, capacity, freq_new, - gd->requested_freq); - if (freq_new == gd->requested_freq) - goto out; - - gd->requested_freq = freq_new; - - /* - * Throttling is not yet supported on platforms with fast cpufreq - * drivers. - */ - if (cpufreq_driver_slow) - irq_work_queue_on(&gd->irq_work, cpu); - else - cpufreq_sched_try_driver_target(policy, freq_new); - -out: - cpufreq_cpu_put(policy); -} - -void update_cpu_capacity_request(int cpu, bool request) -{ - unsigned long new_capacity; - struct sched_capacity_reqs *scr; - - /* The rq lock serializes access to the CPU's sched_capacity_reqs. 
*/ - lockdep_assert_held(&cpu_rq(cpu)->lock); - - scr = &per_cpu(cpu_sched_capacity_reqs, cpu); - - new_capacity = scr->cfs + scr->rt; - new_capacity = new_capacity * capacity_margin - / SCHED_CAPACITY_SCALE; - new_capacity += scr->dl; - - if (new_capacity == scr->total) - return; - - trace_cpufreq_sched_update_capacity(cpu, request, scr, new_capacity); - - scr->total = new_capacity; - if (request) - update_fdomain_capacity_request(cpu); -} - -static inline void set_sched_freq(void) -{ - static_key_slow_inc(&__sched_freq); -} - -static inline void clear_sched_freq(void) -{ - static_key_slow_dec(&__sched_freq); -} - -static struct attribute_group sched_attr_group_gov_pol; -static struct attribute_group *get_sysfs_attr(void) -{ - return &sched_attr_group_gov_pol; -} - -static int cpufreq_sched_policy_init(struct cpufreq_policy *policy) -{ - struct gov_data *gd; - int cpu; - int rc; - - for_each_cpu(cpu, policy->cpus) - memset(&per_cpu(cpu_sched_capacity_reqs, cpu), 0, - sizeof(struct sched_capacity_reqs)); - - gd = kzalloc(sizeof(*gd), GFP_KERNEL); - if (!gd) - return -ENOMEM; - - gd->up_throttle_nsec = policy->cpuinfo.transition_latency ? - policy->cpuinfo.transition_latency : - THROTTLE_UP_NSEC; - gd->down_throttle_nsec = THROTTLE_DOWN_NSEC; - pr_debug("%s: throttle threshold = %u [ns]\n", - __func__, gd->up_throttle_nsec); - - rc = sysfs_create_group(&policy->kobj, get_sysfs_attr()); - if (rc) { - pr_err("%s: couldn't create sysfs attributes: %d\n", __func__, rc); - goto err; - } - - policy->governor_data = gd; - if (cpufreq_driver_is_slow()) { - cpufreq_driver_slow = true; - gd->task = kthread_create(cpufreq_sched_thread, policy, - "kschedfreq:%d", - cpumask_first(policy->related_cpus)); - if (IS_ERR_OR_NULL(gd->task)) { - pr_err("%s: failed to create kschedfreq thread\n", - __func__); - goto err; - } - get_task_struct(gd->task); - kthread_bind_mask(gd->task, policy->related_cpus); - wake_up_process(gd->task); - init_irq_work(&gd->irq_work, cpufreq_sched_irq_work); - } - - set_sched_freq(); - - return 0; - -err: - policy->governor_data = NULL; - kfree(gd); - return -ENOMEM; -} - -static int cpufreq_sched_policy_exit(struct cpufreq_policy *policy) -{ - struct gov_data *gd = policy->governor_data; - - clear_sched_freq(); - if (cpufreq_driver_slow) { - kthread_stop(gd->task); - put_task_struct(gd->task); - } - - sysfs_remove_group(&policy->kobj, get_sysfs_attr()); - - policy->governor_data = NULL; - - kfree(gd); - return 0; -} - -static int cpufreq_sched_start(struct cpufreq_policy *policy) -{ - int cpu; - - for_each_cpu(cpu, policy->cpus) - per_cpu(enabled, cpu) = 1; - - return 0; -} - -static void cpufreq_sched_limits(struct cpufreq_policy *policy) -{ - unsigned int clamp_freq; - struct gov_data *gd = policy->governor_data;; - - pr_debug("limit event for cpu %u: %u - %u kHz, currently %u kHz\n", - policy->cpu, policy->min, policy->max, - policy->cur); - - clamp_freq = clamp(gd->requested_freq, policy->min, policy->max); - - if (policy->cur != clamp_freq) - __cpufreq_driver_target(policy, clamp_freq, CPUFREQ_RELATION_L); -} - -static int cpufreq_sched_stop(struct cpufreq_policy *policy) -{ - int cpu; - - for_each_cpu(cpu, policy->cpus) - per_cpu(enabled, cpu) = 0; - - return 0; -} - -static int cpufreq_sched_setup(struct cpufreq_policy *policy, - unsigned int event) -{ - switch (event) { - case CPUFREQ_GOV_POLICY_INIT: - return cpufreq_sched_policy_init(policy); - case CPUFREQ_GOV_POLICY_EXIT: - return cpufreq_sched_policy_exit(policy); - case CPUFREQ_GOV_START: - return 
cpufreq_sched_start(policy); - case CPUFREQ_GOV_STOP: - return cpufreq_sched_stop(policy); - case CPUFREQ_GOV_LIMITS: - cpufreq_sched_limits(policy); - break; - } - return 0; -} - -/* Tunables */ -static ssize_t show_up_throttle_nsec(struct gov_data *gd, char *buf) -{ - return sprintf(buf, "%u\n", gd->up_throttle_nsec); -} - -static ssize_t store_up_throttle_nsec(struct gov_data *gd, - const char *buf, size_t count) -{ - int ret; - long unsigned int val; - - ret = kstrtoul(buf, 0, &val); - if (ret < 0) - return ret; - gd->up_throttle_nsec = val; - return count; -} - -static ssize_t show_down_throttle_nsec(struct gov_data *gd, char *buf) -{ - return sprintf(buf, "%u\n", gd->down_throttle_nsec); -} - -static ssize_t store_down_throttle_nsec(struct gov_data *gd, - const char *buf, size_t count) -{ - int ret; - long unsigned int val; - - ret = kstrtoul(buf, 0, &val); - if (ret < 0) - return ret; - gd->down_throttle_nsec = val; - return count; -} - -/* - * Create show/store routines - * - sys: One governor instance for complete SYSTEM - * - pol: One governor instance per struct cpufreq_policy - */ -#define show_gov_pol_sys(file_name) \ -static ssize_t show_##file_name##_gov_pol \ -(struct cpufreq_policy *policy, char *buf) \ -{ \ - return show_##file_name(policy->governor_data, buf); \ -} - -#define store_gov_pol_sys(file_name) \ -static ssize_t store_##file_name##_gov_pol \ -(struct cpufreq_policy *policy, const char *buf, size_t count) \ -{ \ - return store_##file_name(policy->governor_data, buf, count); \ -} - -#define gov_pol_attr_rw(_name) \ - static struct freq_attr _name##_gov_pol = \ - __ATTR(_name, 0644, show_##_name##_gov_pol, store_##_name##_gov_pol) - -#define show_store_gov_pol_sys(file_name) \ - show_gov_pol_sys(file_name); \ - store_gov_pol_sys(file_name) -#define tunable_handlers(file_name) \ - show_gov_pol_sys(file_name); \ - store_gov_pol_sys(file_name); \ - gov_pol_attr_rw(file_name) - -tunable_handlers(down_throttle_nsec); -tunable_handlers(up_throttle_nsec); - -/* Per policy governor instance */ -static struct attribute *sched_attributes_gov_pol[] = { - &up_throttle_nsec_gov_pol.attr, - &down_throttle_nsec_gov_pol.attr, - NULL, -}; - -static struct attribute_group sched_attr_group_gov_pol = { - .attrs = sched_attributes_gov_pol, - .name = "sched", -}; - -#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED -static -#endif -struct cpufreq_governor cpufreq_gov_sched = { - .name = "sched", - .governor = cpufreq_sched_setup, - .owner = THIS_MODULE, -}; - -static int __init cpufreq_sched_init(void) -{ - int cpu; - - for_each_cpu(cpu, cpu_possible_mask) - per_cpu(enabled, cpu) = 0; - return cpufreq_register_governor(&cpufreq_gov_sched); -} - -/* Try to make this the default governor */ -fs_initcall(cpufreq_sched_init); diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c new file mode 100644 index 000000000000..6c84b4d28914 --- /dev/null +++ b/kernel/sched/cpufreq_schedutil.c @@ -0,0 +1,827 @@ +/* + * CPUFreq governor based on scheduler-provided CPU utilization data. + * + * Copyright (C) 2016, Intel Corporation + * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/cpufreq.h> +#include <linux/kthread.h> +#include <linux/slab.h> +#include <trace/events/power.h> + +#include "sched.h" +#include "tune.h" + +unsigned long boosted_cpu_util(int cpu); + +/* Stub out fast switch routines present on mainline to reduce the backport + * overhead. */ +#define cpufreq_driver_fast_switch(x, y) 0 +#define cpufreq_enable_fast_switch(x) +#define cpufreq_disable_fast_switch(x) +#define LATENCY_MULTIPLIER (1000) +#define SUGOV_KTHREAD_PRIORITY 50 + +struct sugov_tunables { + struct gov_attr_set attr_set; + unsigned int up_rate_limit_us; + unsigned int down_rate_limit_us; +}; + +struct sugov_policy { + struct cpufreq_policy *policy; + + struct sugov_tunables *tunables; + struct list_head tunables_hook; + + raw_spinlock_t update_lock; /* For shared policies */ + u64 last_freq_update_time; + s64 min_rate_limit_ns; + s64 up_rate_delay_ns; + s64 down_rate_delay_ns; + unsigned int next_freq; + unsigned int cached_raw_freq; + + /* The next fields are only needed if fast switch cannot be used. */ + struct irq_work irq_work; + struct kthread_work work; + struct mutex work_lock; + struct kthread_worker worker; + struct task_struct *thread; + bool work_in_progress; + + bool need_freq_update; +}; + +struct sugov_cpu { + struct update_util_data update_util; + struct sugov_policy *sg_policy; + + bool iowait_boost_pending; + unsigned int iowait_boost; + unsigned int iowait_boost_max; + u64 last_update; + + /* The fields below are only needed when sharing a policy. */ + unsigned long util; + unsigned long max; + unsigned int flags; + + /* The field below is for single-CPU policies only. */ +#ifdef CONFIG_NO_HZ_COMMON + unsigned long saved_idle_calls; +#endif +}; + +static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu); + +/************************ Governor internals ***********************/ + +static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) +{ + s64 delta_ns; + + if (sg_policy->work_in_progress) + return false; + + if (unlikely(sg_policy->need_freq_update)) { + sg_policy->need_freq_update = false; + /* + * This happens when limits change, so forget the previous + * next_freq value and force an update. 
+ */ + sg_policy->next_freq = UINT_MAX; + return true; + } + + delta_ns = time - sg_policy->last_freq_update_time; + + /* No need to recalculate next freq for min_rate_limit_us at least */ + return delta_ns >= sg_policy->min_rate_limit_ns; +} + +static bool sugov_up_down_rate_limit(struct sugov_policy *sg_policy, u64 time, + unsigned int next_freq) +{ + s64 delta_ns; + + delta_ns = time - sg_policy->last_freq_update_time; + + if (next_freq > sg_policy->next_freq && + delta_ns < sg_policy->up_rate_delay_ns) + return true; + + if (next_freq < sg_policy->next_freq && + delta_ns < sg_policy->down_rate_delay_ns) + return true; + + return false; +} + +static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time, + unsigned int next_freq) +{ + struct cpufreq_policy *policy = sg_policy->policy; + + if (sugov_up_down_rate_limit(sg_policy, time, next_freq)) { + /* Reset cached freq as next_freq isn't changed */ + sg_policy->cached_raw_freq = 0; + return; + } + + if (sg_policy->next_freq == next_freq) + return; + + sg_policy->next_freq = next_freq; + sg_policy->last_freq_update_time = time; + + if (policy->fast_switch_enabled) { + next_freq = cpufreq_driver_fast_switch(policy, next_freq); + if (next_freq == CPUFREQ_ENTRY_INVALID) + return; + + policy->cur = next_freq; + trace_cpu_frequency(next_freq, smp_processor_id()); + } else { + sg_policy->work_in_progress = true; + irq_work_queue(&sg_policy->irq_work); + } +} + +/** + * get_next_freq - Compute a new frequency for a given cpufreq policy. + * @sg_policy: schedutil policy object to compute the new frequency for. + * @util: Current CPU utilization. + * @max: CPU capacity. + * + * If the utilization is frequency-invariant, choose the new frequency to be + * proportional to it, that is + * + * next_freq = C * max_freq * util / max + * + * Otherwise, approximate the would-be frequency-invariant utilization by + * util_raw * (curr_freq / max_freq) which leads to + * + * next_freq = C * curr_freq * util_raw / max + * + * Take C = 1.25 for the frequency tipping point at (util / max) = 0.8. + * + * The lowest driver-supported frequency which is equal or greater than the raw + * next_freq (as calculated above) is returned, subject to policy min/max and + * cpufreq driver limitations. + */ +static unsigned int get_next_freq(struct sugov_policy *sg_policy, + unsigned long util, unsigned long max) +{ + struct cpufreq_policy *policy = sg_policy->policy; + unsigned int freq = arch_scale_freq_invariant() ? 
+ policy->cpuinfo.max_freq : policy->cur; + + freq = (freq + (freq >> 2)) * util / max; + + if (freq == sg_policy->cached_raw_freq && sg_policy->next_freq != UINT_MAX) + return sg_policy->next_freq; + sg_policy->cached_raw_freq = freq; + return cpufreq_driver_resolve_freq(policy, freq); +} + +static inline bool use_pelt(void) +{ +#ifdef CONFIG_SCHED_WALT + return (!sysctl_sched_use_walt_cpu_util || walt_disabled); +#else + return true; +#endif +} + +static void sugov_get_util(unsigned long *util, unsigned long *max, u64 time) +{ + int cpu = smp_processor_id(); + struct rq *rq = cpu_rq(cpu); + unsigned long max_cap, rt; + s64 delta; + + max_cap = arch_scale_cpu_capacity(NULL, cpu); + + sched_avg_update(rq); + delta = time - rq->age_stamp; + if (unlikely(delta < 0)) + delta = 0; + rt = div64_u64(rq->rt_avg, sched_avg_period() + delta); + rt = (rt * max_cap) >> SCHED_CAPACITY_SHIFT; + + *util = boosted_cpu_util(cpu); + if (likely(use_pelt())) + *util = *util + rt; + + *util = min(*util, max_cap); + *max = max_cap; +} + +static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, + unsigned int flags) +{ + if (flags & SCHED_CPUFREQ_IOWAIT) { + if (sg_cpu->iowait_boost_pending) + return; + + sg_cpu->iowait_boost_pending = true; + + if (sg_cpu->iowait_boost) { + sg_cpu->iowait_boost <<= 1; + if (sg_cpu->iowait_boost > sg_cpu->iowait_boost_max) + sg_cpu->iowait_boost = sg_cpu->iowait_boost_max; + } else { + sg_cpu->iowait_boost = sg_cpu->sg_policy->policy->min; + } + } else if (sg_cpu->iowait_boost) { + s64 delta_ns = time - sg_cpu->last_update; + + /* Clear iowait_boost if the CPU apprears to have been idle. */ + if (delta_ns > TICK_NSEC) { + sg_cpu->iowait_boost = 0; + sg_cpu->iowait_boost_pending = false; + } + } +} + +static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util, + unsigned long *max) +{ + unsigned int boost_util, boost_max; + + if (!sg_cpu->iowait_boost) + return; + + if (sg_cpu->iowait_boost_pending) { + sg_cpu->iowait_boost_pending = false; + } else { + sg_cpu->iowait_boost >>= 1; + if (sg_cpu->iowait_boost < sg_cpu->sg_policy->policy->min) { + sg_cpu->iowait_boost = 0; + return; + } + } + + boost_util = sg_cpu->iowait_boost; + boost_max = sg_cpu->iowait_boost_max; + + if (*util * boost_max < *max * boost_util) { + *util = boost_util; + *max = boost_max; + } +} + +#ifdef CONFIG_NO_HZ_COMMON +static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) +{ + unsigned long idle_calls = tick_nohz_get_idle_calls(); + bool ret = idle_calls == sg_cpu->saved_idle_calls; + + sg_cpu->saved_idle_calls = idle_calls; + return ret; +} +#else +static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } +#endif /* CONFIG_NO_HZ_COMMON */ + +static void sugov_update_single(struct update_util_data *hook, u64 time, + unsigned int flags) +{ + struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); + struct sugov_policy *sg_policy = sg_cpu->sg_policy; + struct cpufreq_policy *policy = sg_policy->policy; + unsigned long util, max; + unsigned int next_f; + bool busy; + + sugov_set_iowait_boost(sg_cpu, time, flags); + sg_cpu->last_update = time; + + if (!sugov_should_update_freq(sg_policy, time)) + return; + + busy = sugov_cpu_is_busy(sg_cpu); + + if (flags & SCHED_CPUFREQ_DL) { + next_f = policy->cpuinfo.max_freq; + } else { + sugov_get_util(&util, &max, time); + sugov_iowait_boost(sg_cpu, &util, &max); + next_f = get_next_freq(sg_policy, util, max); + /* + * Do not reduce the frequency if the CPU has not been idle + * recently, as 
the reduction is likely to be premature then. + */ + if (busy && next_f < sg_policy->next_freq) { + next_f = sg_policy->next_freq; + + /* Reset cached freq as next_freq has changed */ + sg_policy->cached_raw_freq = 0; + } + } + sugov_update_commit(sg_policy, time, next_f); +} + +static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) +{ + struct sugov_policy *sg_policy = sg_cpu->sg_policy; + struct cpufreq_policy *policy = sg_policy->policy; + unsigned long util = 0, max = 1; + unsigned int j; + + for_each_cpu(j, policy->cpus) { + struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j); + unsigned long j_util, j_max; + s64 delta_ns; + + /* + * If the CPU utilization was last updated before the previous + * frequency update and the time elapsed between the last update + * of the CPU utilization and the last frequency update is long + * enough, don't take the CPU into account as it probably is + * idle now (and clear iowait_boost for it). + */ + delta_ns = time - j_sg_cpu->last_update; + if (delta_ns > TICK_NSEC) { + j_sg_cpu->iowait_boost = 0; + j_sg_cpu->iowait_boost_pending = false; + continue; + } + if (j_sg_cpu->flags & SCHED_CPUFREQ_DL) + return policy->cpuinfo.max_freq; + + j_util = j_sg_cpu->util; + j_max = j_sg_cpu->max; + if (j_util * max > j_max * util) { + util = j_util; + max = j_max; + } + + sugov_iowait_boost(j_sg_cpu, &util, &max); + } + + return get_next_freq(sg_policy, util, max); +} + +static void sugov_update_shared(struct update_util_data *hook, u64 time, + unsigned int flags) +{ + struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); + struct sugov_policy *sg_policy = sg_cpu->sg_policy; + unsigned long util, max; + unsigned int next_f; + + sugov_get_util(&util, &max, time); + + raw_spin_lock(&sg_policy->update_lock); + + sg_cpu->util = util; + sg_cpu->max = max; + sg_cpu->flags = flags; + + sugov_set_iowait_boost(sg_cpu, time, flags); + sg_cpu->last_update = time; + + if (sugov_should_update_freq(sg_policy, time)) { + if (flags & SCHED_CPUFREQ_DL) + next_f = sg_policy->policy->cpuinfo.max_freq; + else + next_f = sugov_next_freq_shared(sg_cpu, time); + + sugov_update_commit(sg_policy, time, next_f); + } + + raw_spin_unlock(&sg_policy->update_lock); +} + +static void sugov_work(struct kthread_work *work) +{ + struct sugov_policy *sg_policy = container_of(work, struct sugov_policy, work); + + mutex_lock(&sg_policy->work_lock); + __cpufreq_driver_target(sg_policy->policy, sg_policy->next_freq, + CPUFREQ_RELATION_L); + mutex_unlock(&sg_policy->work_lock); + + sg_policy->work_in_progress = false; +} + +static void sugov_irq_work(struct irq_work *irq_work) +{ + struct sugov_policy *sg_policy; + + sg_policy = container_of(irq_work, struct sugov_policy, irq_work); + + /* + * For RT and deadline tasks, the schedutil governor shoots the + * frequency to maximum. Special care must be taken to ensure that this + * kthread doesn't result in the same behavior. + * + * This is (mostly) guaranteed by the work_in_progress flag. The flag is + * updated only at the end of the sugov_work() function and before that + * the schedutil governor rejects all other frequency scaling requests. + * + * There is a very rare case though, where the RT thread yields right + * after the work_in_progress flag is cleared. The effects of that are + * neglected for now. 
+ */ + queue_kthread_work(&sg_policy->worker, &sg_policy->work); +} + +/************************** sysfs interface ************************/ + +static struct sugov_tunables *global_tunables; +static DEFINE_MUTEX(global_tunables_lock); + +static inline struct sugov_tunables *to_sugov_tunables(struct gov_attr_set *attr_set) +{ + return container_of(attr_set, struct sugov_tunables, attr_set); +} + +static DEFINE_MUTEX(min_rate_lock); + +static void update_min_rate_limit_us(struct sugov_policy *sg_policy) +{ + mutex_lock(&min_rate_lock); + sg_policy->min_rate_limit_ns = min(sg_policy->up_rate_delay_ns, + sg_policy->down_rate_delay_ns); + mutex_unlock(&min_rate_lock); +} + +static ssize_t up_rate_limit_us_show(struct gov_attr_set *attr_set, char *buf) +{ + struct sugov_tunables *tunables = to_sugov_tunables(attr_set); + + return sprintf(buf, "%u\n", tunables->up_rate_limit_us); +} + +static ssize_t down_rate_limit_us_show(struct gov_attr_set *attr_set, char *buf) +{ + struct sugov_tunables *tunables = to_sugov_tunables(attr_set); + + return sprintf(buf, "%u\n", tunables->down_rate_limit_us); +} + +static ssize_t up_rate_limit_us_store(struct gov_attr_set *attr_set, + const char *buf, size_t count) +{ + struct sugov_tunables *tunables = to_sugov_tunables(attr_set); + struct sugov_policy *sg_policy; + unsigned int rate_limit_us; + + if (kstrtouint(buf, 10, &rate_limit_us)) + return -EINVAL; + + tunables->up_rate_limit_us = rate_limit_us; + + list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook) { + sg_policy->up_rate_delay_ns = rate_limit_us * NSEC_PER_USEC; + update_min_rate_limit_us(sg_policy); + } + + return count; +} + +static ssize_t down_rate_limit_us_store(struct gov_attr_set *attr_set, + const char *buf, size_t count) +{ + struct sugov_tunables *tunables = to_sugov_tunables(attr_set); + struct sugov_policy *sg_policy; + unsigned int rate_limit_us; + + if (kstrtouint(buf, 10, &rate_limit_us)) + return -EINVAL; + + tunables->down_rate_limit_us = rate_limit_us; + + list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook) { + sg_policy->down_rate_delay_ns = rate_limit_us * NSEC_PER_USEC; + update_min_rate_limit_us(sg_policy); + } + + return count; +} + +static struct governor_attr up_rate_limit_us = __ATTR_RW(up_rate_limit_us); +static struct governor_attr down_rate_limit_us = __ATTR_RW(down_rate_limit_us); + +static struct attribute *sugov_attributes[] = { + &up_rate_limit_us.attr, + &down_rate_limit_us.attr, + NULL +}; + +static struct kobj_type sugov_tunables_ktype = { + .default_attrs = sugov_attributes, + .sysfs_ops = &governor_sysfs_ops, +}; + +/********************** cpufreq governor interface *********************/ +#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL +static +#endif +struct cpufreq_governor cpufreq_gov_schedutil; + +static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy) +{ + struct sugov_policy *sg_policy; + + sg_policy = kzalloc(sizeof(*sg_policy), GFP_KERNEL); + if (!sg_policy) + return NULL; + + sg_policy->policy = policy; + raw_spin_lock_init(&sg_policy->update_lock); + return sg_policy; +} + +static void sugov_policy_free(struct sugov_policy *sg_policy) +{ + kfree(sg_policy); +} + +static int sugov_kthread_create(struct sugov_policy *sg_policy) +{ + struct task_struct *thread; + struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO / 2 }; + struct cpufreq_policy *policy = sg_policy->policy; + int ret; + + /* kthread only required for slow path */ + if (policy->fast_switch_enabled) + return 0; + + 
init_kthread_work(&sg_policy->work, sugov_work); + init_kthread_worker(&sg_policy->worker); + thread = kthread_create(kthread_worker_fn, &sg_policy->worker, + "sugov:%d", + cpumask_first(policy->related_cpus)); + if (IS_ERR(thread)) { + pr_err("failed to create sugov thread: %ld\n", PTR_ERR(thread)); + return PTR_ERR(thread); + } + + ret = sched_setscheduler_nocheck(thread, SCHED_FIFO, ¶m); + if (ret) { + kthread_stop(thread); + pr_warn("%s: failed to set SCHED_FIFO\n", __func__); + return ret; + } + + sg_policy->thread = thread; + kthread_bind_mask(thread, policy->related_cpus); + init_irq_work(&sg_policy->irq_work, sugov_irq_work); + mutex_init(&sg_policy->work_lock); + + wake_up_process(thread); + + return 0; +} + +static void sugov_kthread_stop(struct sugov_policy *sg_policy) +{ + /* kthread only required for slow path */ + if (sg_policy->policy->fast_switch_enabled) + return; + + flush_kthread_worker(&sg_policy->worker); + kthread_stop(sg_policy->thread); + mutex_destroy(&sg_policy->work_lock); +} + +static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_policy) +{ + struct sugov_tunables *tunables; + + tunables = kzalloc(sizeof(*tunables), GFP_KERNEL); + if (tunables) { + gov_attr_set_init(&tunables->attr_set, &sg_policy->tunables_hook); + if (!have_governor_per_policy()) + global_tunables = tunables; + } + return tunables; +} + +static void sugov_tunables_free(struct sugov_tunables *tunables) +{ + if (!have_governor_per_policy()) + global_tunables = NULL; + + kfree(tunables); +} + +static int sugov_init(struct cpufreq_policy *policy) +{ + struct sugov_policy *sg_policy; + struct sugov_tunables *tunables; + int ret = 0; + + /* State should be equivalent to EXIT */ + if (policy->governor_data) + return -EBUSY; + + cpufreq_enable_fast_switch(policy); + + sg_policy = sugov_policy_alloc(policy); + if (!sg_policy) { + ret = -ENOMEM; + goto disable_fast_switch; + } + + ret = sugov_kthread_create(sg_policy); + if (ret) + goto free_sg_policy; + + mutex_lock(&global_tunables_lock); + + if (global_tunables) { + if (WARN_ON(have_governor_per_policy())) { + ret = -EINVAL; + goto stop_kthread; + } + policy->governor_data = sg_policy; + sg_policy->tunables = global_tunables; + + gov_attr_set_get(&global_tunables->attr_set, &sg_policy->tunables_hook); + goto out; + } + + tunables = sugov_tunables_alloc(sg_policy); + if (!tunables) { + ret = -ENOMEM; + goto stop_kthread; + } + + if (policy->up_transition_delay_us && policy->down_transition_delay_us) { + tunables->up_rate_limit_us = policy->up_transition_delay_us; + tunables->down_rate_limit_us = policy->down_transition_delay_us; + } else { + unsigned int lat; + + tunables->up_rate_limit_us = LATENCY_MULTIPLIER; + tunables->down_rate_limit_us = LATENCY_MULTIPLIER; + lat = policy->cpuinfo.transition_latency / NSEC_PER_USEC; + if (lat) { + tunables->up_rate_limit_us *= lat; + tunables->down_rate_limit_us *= lat; + } + } + + policy->governor_data = sg_policy; + sg_policy->tunables = tunables; + + ret = kobject_init_and_add(&tunables->attr_set.kobj, &sugov_tunables_ktype, + get_governor_parent_kobj(policy), "%s", + cpufreq_gov_schedutil.name); + if (ret) + goto fail; + +out: + mutex_unlock(&global_tunables_lock); + return 0; + +fail: + policy->governor_data = NULL; + sugov_tunables_free(tunables); + +stop_kthread: + sugov_kthread_stop(sg_policy); + +free_sg_policy: + mutex_unlock(&global_tunables_lock); + + sugov_policy_free(sg_policy); + +disable_fast_switch: + cpufreq_disable_fast_switch(policy); + + pr_err("initialization failed 
(error %d)\n", ret); + return ret; +} + +static int sugov_exit(struct cpufreq_policy *policy) +{ + struct sugov_policy *sg_policy = policy->governor_data; + struct sugov_tunables *tunables = sg_policy->tunables; + unsigned int count; + + mutex_lock(&global_tunables_lock); + + count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook); + policy->governor_data = NULL; + if (!count) + sugov_tunables_free(tunables); + + mutex_unlock(&global_tunables_lock); + + sugov_kthread_stop(sg_policy); + sugov_policy_free(sg_policy); + + cpufreq_disable_fast_switch(policy); + return 0; +} + +static int sugov_start(struct cpufreq_policy *policy) +{ + struct sugov_policy *sg_policy = policy->governor_data; + unsigned int cpu; + + sg_policy->up_rate_delay_ns = + sg_policy->tunables->up_rate_limit_us * NSEC_PER_USEC; + sg_policy->down_rate_delay_ns = + sg_policy->tunables->down_rate_limit_us * NSEC_PER_USEC; + update_min_rate_limit_us(sg_policy); + sg_policy->last_freq_update_time = 0; + sg_policy->next_freq = UINT_MAX; + sg_policy->work_in_progress = false; + sg_policy->need_freq_update = false; + sg_policy->cached_raw_freq = 0; + + for_each_cpu(cpu, policy->cpus) { + struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu); + + memset(sg_cpu, 0, sizeof(*sg_cpu)); + sg_cpu->sg_policy = sg_policy; + sg_cpu->flags = SCHED_CPUFREQ_DL; + sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq; + cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, + policy_is_shared(policy) ? + sugov_update_shared : + sugov_update_single); + } + return 0; +} + +static int sugov_stop(struct cpufreq_policy *policy) +{ + struct sugov_policy *sg_policy = policy->governor_data; + unsigned int cpu; + + for_each_cpu(cpu, policy->cpus) + cpufreq_remove_update_util_hook(cpu); + + synchronize_sched(); + + if (!policy->fast_switch_enabled) { + irq_work_sync(&sg_policy->irq_work); + kthread_cancel_work_sync(&sg_policy->work); + } + return 0; +} + +static int sugov_limits(struct cpufreq_policy *policy) +{ + struct sugov_policy *sg_policy = policy->governor_data; + + if (!policy->fast_switch_enabled) { + mutex_lock(&sg_policy->work_lock); + cpufreq_policy_apply_limits(policy); + mutex_unlock(&sg_policy->work_lock); + } + + sg_policy->need_freq_update = true; + + return 0; +} + +static int cpufreq_schedutil_cb(struct cpufreq_policy *policy, + unsigned int event) +{ + switch(event) { + case CPUFREQ_GOV_POLICY_INIT: + return sugov_init(policy); + case CPUFREQ_GOV_POLICY_EXIT: + return sugov_exit(policy); + case CPUFREQ_GOV_START: + return sugov_start(policy); + case CPUFREQ_GOV_STOP: + return sugov_stop(policy); + case CPUFREQ_GOV_LIMITS: + return sugov_limits(policy); + default: + BUG(); + } +} + +#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL +static +#endif +struct cpufreq_governor cpufreq_gov_schedutil = { + .name = "schedutil", + .governor = cpufreq_schedutil_cb, + .owner = THIS_MODULE, +}; + +static int __init sugov_register(void) +{ + return cpufreq_register_governor(&cpufreq_gov_schedutil); +} +fs_initcall(sugov_register); diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index 981fcd7dc394..14225d5d8617 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -27,6 +27,8 @@ * of the License. 
*/ +#include "sched.h" + #include <linux/gfp.h> #include <linux/sched.h> #include <linux/sched/rt.h> @@ -51,6 +53,27 @@ static int convert_prio(int prio) } /** + * drop_nopreempt_cpus - remove a cpu from the mask if it is likely + * non-preemptible + * @lowest_mask: mask with selected CPUs (non-NULL) + */ +static void +drop_nopreempt_cpus(struct cpumask *lowest_mask) +{ + unsigned int cpu = cpumask_first(lowest_mask); + + while (cpu < nr_cpu_ids) { + /* unlocked access */ + struct task_struct *task = READ_ONCE(cpu_rq(cpu)->curr); + + if (task_may_not_preempt(task, cpu)) + cpumask_clear_cpu(cpu, lowest_mask); + + cpu = cpumask_next(cpu, lowest_mask); + } +} + +/** * cpupri_find - find the best (lowest-pri) CPU in the system * @cp: The cpupri context * @p: The task @@ -70,9 +93,11 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, { int idx = 0; int task_pri = convert_prio(p->prio); + bool drop_nopreempts = task_pri <= MAX_RT_PRIO; BUG_ON(task_pri >= CPUPRI_NR_PRIORITIES); +retry: for (idx = 0; idx < task_pri; idx++) { struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; int skip = 0; @@ -108,7 +133,8 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, if (lowest_mask) { cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask); - + if (drop_nopreempts) + drop_nopreempt_cpus(lowest_mask); /* * We have to ensure that we have at least one bit * still set in the array, since the map could have @@ -123,7 +149,14 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, return 1; } - + /* + * If we can't find any non-preemptible cpu's, retry so we can + * find the lowest priority target and avoid priority inversion. + */ + if (drop_nopreempts) { + drop_nopreempts = false; + goto retry; + } return 0; } @@ -246,3 +279,14 @@ void cpupri_cleanup(struct cpupri *cp) for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) free_cpumask_var(cp->pri_to_cpu[i].mask); } + +/* + * cpupri_check_rt - check if CPU has a RT task + * should be called from rcu-sched read section. + */ +bool cpupri_check_rt(void) +{ + int cpu = raw_smp_processor_id(); + + return cpu_rq(cpu)->rd->cpupri.cpu_to_pri[cpu] > CPUPRI_NORMAL; +} diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 3d55ec89c400..188c8388a63f 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -18,6 +18,8 @@ #include <linux/slab.h> +#include "walt.h" + struct dl_bandwidth def_dl_bandwidth; static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se) @@ -461,13 +463,13 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, * * This function returns true if: * - * runtime / (deadline - t) > dl_runtime / dl_period , + * runtime / (deadline - t) > dl_runtime / dl_deadline , * * IOW we can't recycle current parameters. * - * Notice that the bandwidth check is done against the period. For + * Notice that the bandwidth check is done against the deadline. For * task with deadline equal to period this is the same of using - * dl_deadline instead of dl_period in the equation above. + * dl_period instead of dl_deadline in the equation above. */ static bool dl_entity_overflow(struct sched_dl_entity *dl_se, struct sched_dl_entity *pi_se, u64 t) @@ -492,7 +494,7 @@ static bool dl_entity_overflow(struct sched_dl_entity *dl_se, * of anything below microseconds resolution is actually fiction * (but still we want to give the user that illusion >;). 
*/ - left = (pi_se->dl_period >> DL_SCALE) * (dl_se->runtime >> DL_SCALE); + left = (pi_se->dl_deadline >> DL_SCALE) * (dl_se->runtime >> DL_SCALE); right = ((dl_se->deadline - t) >> DL_SCALE) * (pi_se->dl_runtime >> DL_SCALE); @@ -500,13 +502,84 @@ static bool dl_entity_overflow(struct sched_dl_entity *dl_se, } /* - * When a -deadline entity is queued back on the runqueue, its runtime and - * deadline might need updating. + * Revised wakeup rule [1]: For self-suspending tasks, rather then + * re-initializing task's runtime and deadline, the revised wakeup + * rule adjusts the task's runtime to avoid the task to overrun its + * density. + * + * Reasoning: a task may overrun the density if: + * runtime / (deadline - t) > dl_runtime / dl_deadline + * + * Therefore, runtime can be adjusted to: + * runtime = (dl_runtime / dl_deadline) * (deadline - t) + * + * In such way that runtime will be equal to the maximum density + * the task can use without breaking any rule. + * + * [1] Luca Abeni, Giuseppe Lipari, and Juri Lelli. 2015. Constant + * bandwidth server revisited. SIGBED Rev. 11, 4 (January 2015), 19-24. + */ +static void +update_dl_revised_wakeup(struct sched_dl_entity *dl_se, struct rq *rq) +{ + u64 laxity = dl_se->deadline - rq_clock(rq); + + /* + * If the task has deadline < period, and the deadline is in the past, + * it should already be throttled before this check. + * + * See update_dl_entity() comments for further details. + */ + WARN_ON(dl_time_before(dl_se->deadline, rq_clock(rq))); + + dl_se->runtime = (dl_se->dl_density * laxity) >> 20; +} + +/* + * Regarding the deadline, a task with implicit deadline has a relative + * deadline == relative period. A task with constrained deadline has a + * relative deadline <= relative period. + * + * We support constrained deadline tasks. However, there are some restrictions + * applied only for tasks which do not have an implicit deadline. See + * update_dl_entity() to know more about such restrictions. * - * The policy here is that we update the deadline of the entity only if: - * - the current deadline is in the past, - * - using the remaining runtime with the current deadline would make - * the entity exceed its bandwidth. + * The dl_is_implicit() returns true if the task has an implicit deadline. + */ +static inline bool dl_is_implicit(struct sched_dl_entity *dl_se) +{ + return dl_se->dl_deadline == dl_se->dl_period; +} + +/* + * When a deadline entity is placed in the runqueue, its runtime and deadline + * might need to be updated. This is done by a CBS wake up rule. There are two + * different rules: 1) the original CBS; and 2) the Revisited CBS. + * + * When the task is starting a new period, the Original CBS is used. In this + * case, the runtime is replenished and a new absolute deadline is set. + * + * When a task is queued before the begin of the next period, using the + * remaining runtime and deadline could make the entity to overflow, see + * dl_entity_overflow() to find more about runtime overflow. When such case + * is detected, the runtime and deadline need to be updated. + * + * If the task has an implicit deadline, i.e., deadline == period, the Original + * CBS is applied. the runtime is replenished and a new absolute deadline is + * set, as in the previous cases. + * + * However, the Original CBS does not work properly for tasks with + * deadline < period, which are said to have a constrained deadline. By + * applying the Original CBS, a constrained deadline task would be able to run + * runtime/deadline in a period. 
With deadline < period, the task would + * overrun the runtime/period allowed bandwidth, breaking the admission test. + * + * In order to prevent this misbehavior, the Revised CBS is used for + * constrained deadline tasks when a runtime overflow is detected. In the + * Revised CBS, rather than replenishing & setting a new absolute deadline, + * the remaining runtime of the task is reduced to avoid runtime overflow. + * Please refer to the comments of the update_dl_revised_wakeup() function + * for more about the Revised CBS rule. */ static void update_dl_entity(struct sched_dl_entity *dl_se, struct sched_dl_entity *pi_se) @@ -528,15 +601,28 @@ static void update_dl_entity(struct sched_dl_entity *dl_se, if (dl_time_before(dl_se->deadline, rq_clock(rq)) || dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) { + + if (unlikely(!dl_is_implicit(dl_se) && + !dl_time_before(dl_se->deadline, rq_clock(rq)) && + !dl_se->dl_boosted)){ + update_dl_revised_wakeup(dl_se, rq); + return; + } + dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; dl_se->runtime = pi_se->dl_runtime; } } +static inline u64 dl_next_period(struct sched_dl_entity *dl_se) +{ + return dl_se->deadline - dl_se->dl_deadline + dl_se->dl_period; +} + /* * If the entity depleted all its runtime, and if we want it to sleep * while waiting for some new execution time to become available, we - * set the bandwidth enforcement timer to the replenishment instant + * set the bandwidth replenishment timer to the replenishment instant * and try to activate it. * * Notice that it is important for the caller to know if the timer @@ -558,7 +644,7 @@ static int start_dl_timer(struct task_struct *p) * that it is actually coming from rq->clock and not from * hrtimer's time base reading. */ - act = ns_to_ktime(dl_se->deadline); + act = ns_to_ktime(dl_next_period(dl_se)); now = hrtimer_cb_get_time(timer); delta = ktime_to_ns(now) - rq_clock(rq); act = ktime_add_ns(act, delta); @@ -722,6 +808,39 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se) timer->function = dl_task_timer; } +/* + * During the activation, CBS checks if it can reuse the current task's + * runtime and period. If the deadline of the task is in the past, CBS + * cannot use the runtime, and so it replenishes the task. This rule + * works fine for implicit deadline tasks (deadline == period), and the + * CBS was designed for implicit deadline tasks. However, a task with + * constrained deadline (deadline < period) might be awakened after the + * deadline, but before the next period. In this case, replenishing the + * task would allow it to run for runtime / deadline. As in this case + * deadline < period, CBS enables a task to run for more than the + * runtime / period. In a very loaded system, this can cause a domino + * effect, making other tasks miss their deadlines. + * + * To avoid this problem, in the activation of a constrained deadline + * task after the deadline but before the next period, throttle the + * task and set the replenishing timer to the beginning of the next period, + * unless it is boosted.
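+ * For example (illustrative numbers): a task with dl_deadline = 10ms and + * dl_period = 100ms that wakes up 15ms into its period has missed its deadline; + * it is throttled and its timer is armed at dl_next_period(), i.e. the 100ms + * boundary, instead of being replenished immediately.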
+ */ +static inline void dl_check_constrained_dl(struct sched_dl_entity *dl_se) +{ + struct task_struct *p = dl_task_of(dl_se); + struct rq *rq = rq_of_dl_rq(dl_rq_of_se(dl_se)); + + if (dl_time_before(dl_se->deadline, rq_clock(rq)) && + dl_time_before(rq_clock(rq), dl_next_period(dl_se))) { + if (unlikely(dl_se->dl_boosted || !start_dl_timer(p))) + return; + dl_se->dl_throttled = 1; + if (dl_se->runtime > 0) + dl_se->runtime = 0; + } +} + static int dl_runtime_exceeded(struct sched_dl_entity *dl_se) { @@ -755,6 +874,9 @@ static void update_curr_dl(struct rq *rq) if (unlikely((s64)delta_exec <= 0)) return; + /* kick cpufreq (see the comment in kernel/sched/sched.h). */ + cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_DL); + schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); @@ -1037,6 +1159,15 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) } /* + * Check if a constrained deadline task was activated + * after the deadline but before the next period. + * If that is the case, the task will be throttled and + * the replenishment timer will be set to the next period. + */ + if (!p->dl.dl_throttled && !dl_is_implicit(&p->dl)) + dl_check_constrained_dl(&p->dl); + + /* * If p is throttled, we do nothing. In fact, if it exhausted * its budget it needs a replenishment and, since it now is on * its rq, the bandwidth timer callback (which clearly has not @@ -1102,7 +1233,8 @@ static void yield_task_dl(struct rq *rq) static int find_later_rq(struct task_struct *task); static int -select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) +select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags, + int sibling_count_hint) { struct task_struct *curr; struct rq *rq; @@ -1620,7 +1752,9 @@ retry: next_task->on_rq = TASK_ON_RQ_MIGRATING; deactivate_task(rq, next_task, 0); clear_average_bw(&next_task->dl, &rq->dl); + next_task->on_rq = TASK_ON_RQ_MIGRATING; set_task_cpu(next_task, later_rq->cpu); + next_task->on_rq = TASK_ON_RQ_QUEUED; add_average_bw(&next_task->dl, &later_rq->dl); activate_task(later_rq, next_task, 0); next_task->on_rq = TASK_ON_RQ_QUEUED; @@ -1712,7 +1846,9 @@ static void pull_dl_task(struct rq *this_rq) p->on_rq = TASK_ON_RQ_MIGRATING; deactivate_task(src_rq, p, 0); clear_average_bw(&p->dl, &src_rq->dl); + p->on_rq = TASK_ON_RQ_MIGRATING; set_task_cpu(p, this_cpu); + p->on_rq = TASK_ON_RQ_QUEUED; add_average_bw(&p->dl, &this_rq->dl); activate_task(this_rq, p, 0); p->on_rq = TASK_ON_RQ_QUEUED; diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index c8c4272c61d8..ed8e6bb4531b 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -636,6 +636,32 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) P(se.statistics.nr_wakeups_affine_attempts); P(se.statistics.nr_wakeups_passive); P(se.statistics.nr_wakeups_idle); + /* eas */ + /* select_idle_sibling() */ + P(se.statistics.nr_wakeups_sis_attempts); + P(se.statistics.nr_wakeups_sis_idle); + P(se.statistics.nr_wakeups_sis_cache_affine); + P(se.statistics.nr_wakeups_sis_suff_cap); + P(se.statistics.nr_wakeups_sis_idle_cpu); + P(se.statistics.nr_wakeups_sis_count); + /* select_energy_cpu_brute() */ + P(se.statistics.nr_wakeups_secb_attempts); + P(se.statistics.nr_wakeups_secb_sync); + P(se.statistics.nr_wakeups_secb_idle_bt); + P(se.statistics.nr_wakeups_secb_insuff_cap); + P(se.statistics.nr_wakeups_secb_no_nrg_sav); + P(se.statistics.nr_wakeups_secb_nrg_sav); + P(se.statistics.nr_wakeups_secb_count); + /* 
find_best_target() */ + P(se.statistics.nr_wakeups_fbt_attempts); + P(se.statistics.nr_wakeups_fbt_no_cpu); + P(se.statistics.nr_wakeups_fbt_no_sd); + P(se.statistics.nr_wakeups_fbt_pref_idle); + P(se.statistics.nr_wakeups_fbt_count); + /* cas */ + /* select_task_rq_fair() */ + P(se.statistics.nr_wakeups_cas_attempts); + P(se.statistics.nr_wakeups_cas_count); #if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED) __P(load_avg); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 099a1b93bebf..23e37b0674df 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -35,6 +35,8 @@ #include "sched.h" #include <trace/events/sched.h> #include "tune.h" +#include "walt.h" + /* * Targeted preemption latency for CPU-bound tasks: * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds) @@ -50,9 +52,7 @@ unsigned int sysctl_sched_latency = 6000000ULL; unsigned int normalized_sysctl_sched_latency = 6000000ULL; -unsigned int sysctl_sched_is_big_little = 0; unsigned int sysctl_sched_sync_hint_enable = 1; -unsigned int sysctl_sched_initial_task_util = 0; unsigned int sysctl_sched_cstate_aware = 1; /* @@ -119,6 +119,12 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; #endif +/* + * The margin used when comparing utilization with CPU capacity: + * util * margin < capacity * 1024 + * (with margin = 1280, a CPU is considered to have spare capacity as long + * as util stays below ~80% of its capacity) + */ +unsigned int capacity_margin = 1280; /* ~20% */ + static inline void update_load_add(struct load_weight *lw, unsigned long inc) { lw->weight += inc; @@ -294,19 +300,59 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) { if (!cfs_rq->on_list) { + struct rq *rq = rq_of(cfs_rq); + int cpu = cpu_of(rq); /* * Ensure we either appear before our parent (if already * enqueued) or force our parent to appear after us when it is - * enqueued. The fact that we always enqueue bottom-up - * reduces this to two cases. + * enqueued. The fact that we always enqueue bottom-up + * reduces this to two cases and a special case for the root + * cfs_rq. Furthermore, it also means that we will always reset + * tmp_alone_branch either when the branch is connected + * to a tree or when we reach the beginning of the tree. */ if (cfs_rq->tg->parent && - cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) { - list_add_rcu(&cfs_rq->leaf_cfs_rq_list, - &rq_of(cfs_rq)->leaf_cfs_rq_list); - } else { + cfs_rq->tg->parent->cfs_rq[cpu]->on_list) { + /* + * If the parent is already on the list, we add the child + * just before. Thanks to the circular linked property of + * the list, this puts the child at the tail + * of the sublist that starts at the parent. + */ + list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, + &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list)); + /* + * The branch is now connected to its tree so we can + * reset tmp_alone_branch to the beginning of the + * list. + */ + rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; + } else if (!cfs_rq->tg->parent) { + /* + * A cfs_rq without a parent should be put + * at the tail of the list. + */ + list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, + &rq->leaf_cfs_rq_list); + /* + * We have reached the beginning of a tree so we can reset + * tmp_alone_branch to the beginning of the list. + */ + rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; + } else { + /* + * The parent has not been added yet, so we want to + * make sure that it will be put after us.
+ * tmp_alone_branch points to the beginning of the branch + * where we will add the parent. + */ + list_add_rcu(&cfs_rq->leaf_cfs_rq_list, + rq->tmp_alone_branch); + /* + * Update tmp_alone_branch to point to the new beginning + * of the branch. + */ + rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list; } cfs_rq->on_list = 1; @@ -664,7 +710,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) } #ifdef CONFIG_SMP -static int select_idle_sibling(struct task_struct *p, int cpu); +static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu); static unsigned long task_h_load(struct task_struct *p); /* @@ -688,20 +734,115 @@ void init_entity_runnable_average(struct sched_entity *se) * will definitely be updated (after enqueue). */ sa->period_contrib = 1023; - sa->load_avg = scale_load_down(se->load.weight); + /* + * Tasks are initialized with full load to be seen as heavy tasks until + * they get a chance to stabilize to their real load level. + * Group entities are initialized with zero load to reflect the fact that + * nothing has been attached to the task group yet. + */ + if (entity_is_task(se)) + sa->load_avg = scale_load_down(se->load.weight); sa->load_sum = sa->load_avg * LOAD_AVG_MAX; - sa->util_avg = sched_freq() ? - sysctl_sched_initial_task_util : - scale_load_down(SCHED_LOAD_SCALE); - sa->util_sum = sa->util_avg * LOAD_AVG_MAX; + /* + * In previous Android versions, we used to have: + * sa->util_avg = scale_load_down(SCHED_LOAD_SCALE); + * sa->util_sum = sa->util_avg * LOAD_AVG_MAX; + * However, that functionality has been moved to enqueue. + * It is unclear if we should restore this in enqueue. + */ + /* + * At this point, util_avg won't be used in select_task_rq_fair anyway + */ + sa->util_avg = 0; + sa->util_sum = 0; /* when this task is enqueued, it will contribute to its cfs_rq's load_avg */ } -#else +static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); +static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq); +static void attach_entity_cfs_rq(struct sched_entity *se); +static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se); + +/* + * With new tasks being created, their initial util_avgs are extrapolated + * based on the cfs_rq's current util_avg: + * + * util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight + * + * However, in many cases, the above util_avg does not give a desired + * value. Moreover, the sum of the util_avgs may be divergent, such + * as when the series is a harmonic series. + * + * To solve this problem, we also cap the util_avg of successive tasks to + * only 1/2 of the remaining utilization budget: + * + * util_avg_cap = (1024 - cfs_rq->avg.util_avg) / 2^n + * + * where n denotes the nth task. + * + * For example, the simplest series from the beginning would be like: + * + * task util_avg: 512, 256, 128, 64, 32, 16, 8, ... + * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ... + * + * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap) + * if util_avg > util_avg_cap.
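+ * As a rough numeric sketch: with cfs_rq->avg.util_avg = 600 on a 1024-capacity + * CPU, cap = (1024 - 600) / 2 = 212, so a new task whose extrapolated util_avg + * comes out above 212 is started at 212 instead.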
+ */ +void post_init_entity_util_avg(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = cfs_rq_of(se); + struct sched_avg *sa = &se->avg; + long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2; + + if (cap > 0) { + if (cfs_rq->avg.util_avg != 0) { + sa->util_avg = cfs_rq->avg.util_avg * se->load.weight; + sa->util_avg /= (cfs_rq->avg.load_avg + 1); + + if (sa->util_avg > cap) + sa->util_avg = cap; + } else { + sa->util_avg = cap; + } + /* + * If we wish to restore tuning via setting initial util, + * this is where we should do it. + */ + sa->util_sum = sa->util_avg * LOAD_AVG_MAX; + } + + if (entity_is_task(se)) { + struct task_struct *p = task_of(se); + if (p->sched_class != &fair_sched_class) { + /* + * For !fair tasks do: + * + update_cfs_rq_load_avg(now, cfs_rq, false); + attach_entity_load_avg(cfs_rq, se); + switched_from_fair(rq, p); + * + * such that the next switched_to_fair() has the + * expected state. + */ + se->avg.last_update_time = cfs_rq_clock_task(cfs_rq); + return; + } + } + + attach_entity_cfs_rq(se); +} + +#else /* !CONFIG_SMP */ void init_entity_runnable_average(struct sched_entity *se) { } -#endif +void post_init_entity_util_avg(struct sched_entity *se) +{ +} +static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) +{ +} +#endif /* CONFIG_SMP */ /* * Update the current task's runtime statistics. @@ -1425,7 +1566,8 @@ balance: * Call select_idle_sibling to maybe find a better one. */ if (!cur) - env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu); + env->dst_cpu = select_idle_sibling(env->p, env->src_cpu, + env->dst_cpu); assign: assigned = true; @@ -2410,28 +2552,22 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) #ifdef CONFIG_FAIR_GROUP_SCHED # ifdef CONFIG_SMP -static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq) +static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) { - long tg_weight; + long tg_weight, load, shares; /* - * Use this CPU's real-time load instead of the last load contribution - * as the updating of the contribution is delayed, and we will use the - * the real-time load to calc the share. See update_tg_load_avg(). + * This really should be: cfs_rq->avg.load_avg, but instead we use + * cfs_rq->load.weight, which is its upper bound. This helps ramp up + * the shares for small weight interactive tasks. 
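+ * A sketch with made-up numbers: with tg->shares = 1024, this cfs_rq's + * load = 512, and tg_weight = 2048 after substituting the current load, the + * formula below hands this entity shares = 1024 * 512 / 2048 = 256.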
*/ - tg_weight = atomic_long_read(&tg->load_avg); - tg_weight -= cfs_rq->tg_load_avg_contrib; - tg_weight += cfs_rq->load.weight; - - return tg_weight; -} + load = scale_load_down(cfs_rq->load.weight); -static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) -{ - long tg_weight, load, shares; + tg_weight = atomic_long_read(&tg->load_avg); - tg_weight = calc_tg_weight(tg, cfs_rq); - load = cfs_rq->load.weight; + /* Ensure tg_weight >= load */ + tg_weight -= cfs_rq->tg_load_avg_contrib; + tg_weight += load; shares = (tg->shares * load); if (tg_weight) @@ -2450,6 +2586,7 @@ static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) return tg->shares; } # endif /* CONFIG_SMP */ + static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, unsigned long weight) { @@ -2468,16 +2605,20 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); -static void update_cfs_shares(struct cfs_rq *cfs_rq) +static void update_cfs_shares(struct sched_entity *se) { + struct cfs_rq *cfs_rq = group_cfs_rq(se); struct task_group *tg; - struct sched_entity *se; long shares; - tg = cfs_rq->tg; - se = tg->se[cpu_of(rq_of(cfs_rq))]; - if (!se || throttled_hierarchy(cfs_rq)) + if (!cfs_rq) + return; + + if (throttled_hierarchy(cfs_rq)) return; + + tg = cfs_rq->tg; + #ifndef CONFIG_SMP if (likely(se->load.weight == tg->shares)) return; @@ -2486,8 +2627,9 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq) reweight_entity(cfs_rq_of(se), se, shares); } + #else /* CONFIG_FAIR_GROUP_SCHED */ -static inline void update_cfs_shares(struct cfs_rq *cfs_rq) +static inline void update_cfs_shares(struct sched_entity *se) { } #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -2499,6 +2641,7 @@ u32 sched_get_wake_up_idle(struct task_struct *p) return !!enabled; } +EXPORT_SYMBOL(sched_get_wake_up_idle); int sched_set_wake_up_idle(struct task_struct *p, int wake_up_idle) { @@ -2511,6 +2654,7 @@ int sched_set_wake_up_idle(struct task_struct *p, int wake_up_idle) return 0; } +EXPORT_SYMBOL(sched_set_wake_up_idle); static const u32 runnable_avg_yN_inv[] = { 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6, @@ -3790,25 +3934,262 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, return decayed; } -#ifdef CONFIG_FAIR_GROUP_SCHED /* - * Updating tg's load_avg is necessary before update_cfs_share (which is done) - * and effective_load (which is not done because it is too costly). + * Signed add and clamp on underflow. + * + * Explicitly do a load-store to ensure the intermediate value never hits + * memory. This allows lockless observations without ever seeing the negative + * values. + */ +#define add_positive(_ptr, _val) do { \ + typeof(_ptr) ptr = (_ptr); \ + typeof(_val) val = (_val); \ + typeof(*ptr) res, var = READ_ONCE(*ptr); \ + \ + res = var + val; \ + \ + if (val < 0 && res > var) \ + res = 0; \ + \ + WRITE_ONCE(*ptr, res); \ +} while (0) + +#ifdef CONFIG_FAIR_GROUP_SCHED +/** + * update_tg_load_avg - update the tg's load avg + * @cfs_rq: the cfs_rq whose avg changed + * @force: update regardless of how small the difference + * + * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load. + * However, because tg->load_avg is a global value there are performance + * considerations. + * + * In order to avoid having to look at the other cfs_rq's, we use a + * differential update where we store the last value we propagated. 
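+ * (For instance, with tg_load_avg_contrib = 2048 the update below is skipped + * as long as |delta| <= 2048 / 64 = 32.)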
This in + * turn allows skipping updates if the differential is 'small'. + * + * Updating tg's load_avg is necessary before update_cfs_share() (which is + * done) and effective_load() (which is not done because it is too costly). */ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) { long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib; + /* + * No need to update load_avg for root_task_group as it is not used. + */ + if (cfs_rq->tg == &root_task_group) + return; + if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) { atomic_long_add(delta, &cfs_rq->tg->load_avg); cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg; } } +/* + * Called within set_task_rq() right before setting a task's cpu. The + * caller only guarantees p->pi_lock is held; no other assumptions, + * including the state of rq->lock, should be made. + */ +void set_task_rq_fair(struct sched_entity *se, + struct cfs_rq *prev, struct cfs_rq *next) +{ + if (!sched_feat(ATTACH_AGE_LOAD)) + return; + + /* + * We are supposed to update the task to "current" time, so that it's up + * to date and ready to go to the new CPU/cfs_rq. But we have difficulty + * getting what the current time is, so simply throw away the out-of-date + * time. This will result in the wakee task being less decayed, but giving + * the wakee more load does not sound bad. + */ + if (se->avg.last_update_time && prev) { + u64 p_last_update_time; + u64 n_last_update_time; + +#ifndef CONFIG_64BIT + u64 p_last_update_time_copy; + u64 n_last_update_time_copy; + + do { + p_last_update_time_copy = prev->load_last_update_time_copy; + n_last_update_time_copy = next->load_last_update_time_copy; + + smp_rmb(); + + p_last_update_time = prev->avg.last_update_time; + n_last_update_time = next->avg.last_update_time; + + } while (p_last_update_time != p_last_update_time_copy || + n_last_update_time != n_last_update_time_copy); +#else + p_last_update_time = prev->avg.last_update_time; + n_last_update_time = next->avg.last_update_time; +#endif + __update_load_avg(p_last_update_time, cpu_of(rq_of(prev)), + &se->avg, 0, 0, NULL); + se->avg.last_update_time = n_last_update_time; + } +} + +/* Take into account change of utilization of a child task group */ +static inline void +update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + struct cfs_rq *gcfs_rq = group_cfs_rq(se); + long delta = gcfs_rq->avg.util_avg - se->avg.util_avg; + + /* Nothing to update */ + if (!delta) + return; + + /* Set new sched_entity's utilization */ + se->avg.util_avg = gcfs_rq->avg.util_avg; + se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX; + + /* Update parent cfs_rq utilization */ + add_positive(&cfs_rq->avg.util_avg, delta); + cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX; +} + +/* Take into account change of load of a child task group */ +static inline void +update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + struct cfs_rq *gcfs_rq = group_cfs_rq(se); + long delta, load = gcfs_rq->avg.load_avg; + + /* + * If the load of the group cfs_rq is zero, the load of the + * sched_entity will also be zero, so we can skip the formula + */ + if (load) { + long tg_load; + + /* Get tg's load and ensure tg_load > 0 */ + tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1; + + /* Ensure tg_load >= load and is updated with the current load */ + tg_load -= gcfs_rq->tg_load_avg_contrib; + tg_load += load; + + /* + * We need to compute a correction term in the case that the + * task group is consuming more CPU than a task of equal + * weight.
A task with a weight equal to tg->shares will have + * a load less than or equal to scale_load_down(tg->shares). + * Similarly, the sched_entities that represent the task group + * at the parent level can't have a load higher than + * scale_load_down(tg->shares). And the sum of the sched_entities' + * load must be <= scale_load_down(tg->shares). + */ + if (tg_load > scale_load_down(gcfs_rq->tg->shares)) { + /* Scale gcfs_rq's load into tg's shares */ + load *= scale_load_down(gcfs_rq->tg->shares); + load /= tg_load; + } + } + + delta = load - se->avg.load_avg; + + /* Nothing to update */ + if (!delta) + return; + + /* Set new sched_entity's load */ + se->avg.load_avg = load; + se->avg.load_sum = se->avg.load_avg * LOAD_AVG_MAX; + + /* Update parent cfs_rq load */ + add_positive(&cfs_rq->avg.load_avg, delta); + cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * LOAD_AVG_MAX; + + /* + * If the sched_entity is already enqueued, we also have to update the + * runnable load avg. + */ + if (se->on_rq) { + /* Update parent cfs_rq runnable_load_avg */ + add_positive(&cfs_rq->runnable_load_avg, delta); + cfs_rq->runnable_load_sum = cfs_rq->runnable_load_avg * LOAD_AVG_MAX; + } +} + +static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) +{ + cfs_rq->propagate_avg = 1; +} + +static inline int test_and_clear_tg_cfs_propagate(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = group_cfs_rq(se); + + if (!cfs_rq->propagate_avg) + return 0; + + cfs_rq->propagate_avg = 0; + return 1; +} + +/* Update task and its cfs_rq load average */ +static inline int propagate_entity_load_avg(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq; + + if (entity_is_task(se)) + return 0; + + if (!test_and_clear_tg_cfs_propagate(se)) + return 0; + + cfs_rq = cfs_rq_of(se); + + set_tg_cfs_propagate(cfs_rq); + + update_tg_cfs_util(cfs_rq, se); + update_tg_cfs_load(cfs_rq, se); + + return 1; +} + #else /* CONFIG_FAIR_GROUP_SCHED */ + static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} + +static inline int propagate_entity_load_avg(struct sched_entity *se) +{ + return 0; +} + +static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {} + #endif /* CONFIG_FAIR_GROUP_SCHED */ +static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) +{ + if (&this_rq()->cfs == cfs_rq) { + /* + * There are a few boundary cases this might miss but it should + * get called often enough that this should (hopefully) not be + * a real problem -- in addition, it is only called for the local + * CPU, so if we enqueue remotely we'll miss an update, but + * the next tick/schedule should update it. + * + * It will not get called when we go idle, because the idle + * thread is a different class (!fair), nor will the utilization + * number include things like RT tasks. + * + * As is, the util number is not freq-invariant (we'd have to + * implement arch_scale_freq_capacity() for that). + * + * See cpu_util().
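+ * (A hedged illustration: without frequency invariance, a task running 50% + * of wall time reads as util ~512 whether the CPU is at full or half speed; + * a freq-invariant signal would report ~512 at full speed but only ~256 at + * half speed for the same running time.)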
+ */ + cpufreq_update_util(rq_of(cfs_rq), 0); + } +} + static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); /* @@ -3828,23 +4209,43 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); WRITE_ONCE(*ptr, res); \ } while (0) -/* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */ -static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) +/** + * update_cfs_rq_load_avg - update the cfs_rq's load/util averages + * @now: current time, as per cfs_rq_clock_task() + * @cfs_rq: cfs_rq to update + * @update_freq: should we call cfs_rq_util_change() or will the call do so + * + * The cfs_rq avg is the direct sum of all its entities (blocked and runnable) + * avg. The immediate corollary is that all (fair) tasks must be attached, see + * post_init_entity_util_avg(). + * + * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example. + * + * Returns true if the load decayed or we removed load. + * + * Since both these conditions indicate a changed cfs_rq->avg.load we should + * call update_tg_load_avg() when this function returns true. + */ +static inline int +update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) { struct sched_avg *sa = &cfs_rq->avg; - int decayed, removed = 0; + int decayed, removed = 0, removed_util = 0; if (atomic_long_read(&cfs_rq->removed_load_avg)) { s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0); sub_positive(&sa->load_avg, r); sub_positive(&sa->load_sum, r * LOAD_AVG_MAX); removed = 1; + set_tg_cfs_propagate(cfs_rq); } if (atomic_long_read(&cfs_rq->removed_util_avg)) { long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0); sub_positive(&sa->util_avg, r); sub_positive(&sa->util_sum, r * LOAD_AVG_MAX); + removed_util = 1; + set_tg_cfs_propagate(cfs_rq); } decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, @@ -3859,68 +4260,89 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) if (cfs_rq == &rq_of(cfs_rq)->cfs) trace_sched_load_avg_cpu(cpu_of(rq_of(cfs_rq)), cfs_rq); + if (update_freq && (decayed || removed_util)) + cfs_rq_util_change(cfs_rq); + return decayed || removed; } +/* + * Optional action to be done while updating the load average + */ +#define UPDATE_TG 0x1 +#define SKIP_AGE_LOAD 0x2 + /* Update task and its cfs_rq load average */ -static inline void update_load_avg(struct sched_entity *se, int update_tg) +static inline void update_load_avg(struct sched_entity *se, int flags) { struct cfs_rq *cfs_rq = cfs_rq_of(se); u64 now = cfs_rq_clock_task(cfs_rq); int cpu = cpu_of(rq_of(cfs_rq)); + int decayed; + void *ptr = NULL; /* * Track the task's load average for carrying it to the new CPU after it is * migrated, and track the group sched_entity load average for the task_h_load * calc in migration */ - __update_load_avg(now, cpu, &se->avg, + if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) { + __update_load_avg(now, cpu, &se->avg, se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL); + } + + decayed = update_cfs_rq_load_avg(now, cfs_rq, true); + decayed |= propagate_entity_load_avg(se); - if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg) + if (decayed && (flags & UPDATE_TG)) update_tg_load_avg(cfs_rq, 0); - if (entity_is_task(se)) - trace_sched_load_avg_task(task_of(se), &se->avg); + if (entity_is_task(se)) { +#ifdef CONFIG_SCHED_WALT + ptr = (void *)&(task_of(se)->ravg); +#endif + trace_sched_load_avg_task(task_of(se), &se->avg, ptr); + } } +/** + * attach_entity_load_avg - attach this entity to its cfs_rq load avg + * @cfs_rq: cfs_rq to
attach to + * @se: sched_entity to attach + * + * Must call update_cfs_rq_load_avg() before this, since we rely on + * cfs_rq->avg.last_update_time being current. + */ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - if (!sched_feat(ATTACH_AGE_LOAD)) - goto skip_aging; - - /* - * If we got migrated (either between CPUs or between cgroups) we'll - * have aged the average right before clearing @last_update_time. - */ - if (se->avg.last_update_time) { - __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), - &se->avg, 0, 0, NULL); - - /* - * XXX: we could have just aged the entire load away if we've been - * absent from the fair class for too long. - */ - } - -skip_aging: se->avg.last_update_time = cfs_rq->avg.last_update_time; cfs_rq->avg.load_avg += se->avg.load_avg; cfs_rq->avg.load_sum += se->avg.load_sum; cfs_rq->avg.util_avg += se->avg.util_avg; cfs_rq->avg.util_sum += se->avg.util_sum; + set_tg_cfs_propagate(cfs_rq); + + cfs_rq_util_change(cfs_rq); } +/** + * detach_entity_load_avg - detach this entity from its cfs_rq load avg + * @cfs_rq: cfs_rq to detach from + * @se: sched_entity to detach + * + * Must call update_cfs_rq_load_avg() before this, since we rely on + * cfs_rq->avg.last_update_time being current. + */ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), - &se->avg, se->on_rq * scale_load_down(se->load.weight), - cfs_rq->curr == se, NULL); sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg); sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum); sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum); + set_tg_cfs_propagate(cfs_rq); + + cfs_rq_util_change(cfs_rq); } /* Add the load generated by se into cfs_rq's load average */ @@ -3928,34 +4350,20 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { struct sched_avg *sa = &se->avg; - u64 now = cfs_rq_clock_task(cfs_rq); - int migrated, decayed; - - migrated = !sa->last_update_time; - if (!migrated) { - __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, - se->on_rq * scale_load_down(se->load.weight), - cfs_rq->curr == se, NULL); - } - - decayed = update_cfs_rq_load_avg(now, cfs_rq); cfs_rq->runnable_load_avg += sa->load_avg; cfs_rq->runnable_load_sum += sa->load_sum; - if (migrated) + if (!sa->last_update_time) { attach_entity_load_avg(cfs_rq, se); - - if (decayed || migrated) update_tg_load_avg(cfs_rq, 0); + } } /* Remove the runnable load generated by se from cfs_rq's runnable load average */ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { - update_load_avg(se, 1); - cfs_rq->runnable_load_avg = max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0); cfs_rq->runnable_load_sum = @@ -3984,24 +4392,37 @@ static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) #endif /* + * Synchronize entity load avg of dequeued entity without locking + * the previous rq. + */ +void sync_entity_load_avg(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = cfs_rq_of(se); + u64 last_update_time; + + last_update_time = cfs_rq_last_update_time(cfs_rq); + __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL); +} + +/* * Task first catches up with cfs_rq, and then subtracts * itself from the cfs_rq (task must be off the queue now).
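 * The subtraction is made lockless: the entity's contribution is accumulated * into cfs_rq->removed_load_avg and cfs_rq->removed_util_avg with atomic adds, * and the next update_cfs_rq_load_avg() on that cfs_rq folds it back in.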
*/ void remove_entity_load_avg(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - u64 last_update_time; /* - * Newly created task or never used group entity should not be removed - * from its (source) cfs_rq + * Tasks cannot exit without having gone through wake_up_new_task() -> + * post_init_entity_util_avg() which will have added things to the + * cfs_rq, so we can remove unconditionally. + * + * Similarly for groups, they will have passed through + * post_init_entity_util_avg() before unregister_sched_fair_group() + * calls this. */ - if (se->avg.last_update_time == 0) - return; - last_update_time = cfs_rq_last_update_time(cfs_rq); - - __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL); + sync_entity_load_avg(se); atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg); atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg); } @@ -4038,7 +4459,16 @@ static int idle_balance(struct rq *this_rq); #else /* CONFIG_SMP */ -static inline void update_load_avg(struct sched_entity *se, int update_tg) {} +static inline int +update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) +{ + return 0; +} + +#define UPDATE_TG 0x0 +#define SKIP_AGE_LOAD 0x0 + +static inline void update_load_avg(struct sched_entity *se, int not_used1) {} static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} static inline void @@ -4187,9 +4617,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); + update_load_avg(se, UPDATE_TG); enqueue_entity_load_avg(cfs_rq, se); + update_cfs_shares(se); account_entity_enqueue(cfs_rq, se); - update_cfs_shares(cfs_rq); if (flags & ENQUEUE_WAKEUP) { place_entity(cfs_rq, se, 0); @@ -4262,6 +4693,16 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); + + /* + * When dequeuing a sched_entity, we must: + * - Update loads to have both entity and cfs_rq synced with now. + * - Subtract its load from the cfs_rq->runnable_avg. + * - Subtract its previous weight from cfs_rq->load.weight. + * - For a group entity, update its weight to reflect the new share + * of its group cfs_rq. + */ + update_load_avg(se, UPDATE_TG); dequeue_entity_load_avg(cfs_rq, se); update_stats_dequeue(cfs_rq, se); @@ -4297,7 +4738,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) return_cfs_rq_runtime(cfs_rq); update_min_vruntime(cfs_rq); - update_cfs_shares(cfs_rq); + update_cfs_shares(se); } /* @@ -4352,7 +4793,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) */ update_stats_wait_end(cfs_rq, se); __dequeue_entity(cfs_rq, se); - update_load_avg(se, 1); + update_load_avg(se, UPDATE_TG); } update_stats_curr_start(cfs_rq, se); @@ -4468,8 +4909,8 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) /* * Ensure that runnable average is periodically updated.
*/ - update_load_avg(curr, 1); - update_cfs_shares(cfs_rq); + update_load_avg(curr, UPDATE_TG); + update_cfs_shares(curr); #ifdef CONFIG_SCHED_HRTICK /* @@ -5372,29 +5813,13 @@ static inline void hrtick_update(struct rq *rq) #endif #ifdef CONFIG_SMP +static bool __cpu_overutilized(int cpu, int delta); static bool cpu_overutilized(int cpu); -static inline unsigned long boosted_cpu_util(int cpu); +unsigned long boosted_cpu_util(int cpu); #else -#define boosted_cpu_util(cpu) cpu_util(cpu) +#define boosted_cpu_util(cpu) cpu_util_freq(cpu) #endif -#if defined(CONFIG_SMP) && defined(CONFIG_CPU_FREQ_GOV_SCHED) -static void update_capacity_of(int cpu) -{ - unsigned long req_cap; - - if (!sched_freq()) - return; - - /* Convert scale-invariant capacity to cpu. */ - req_cap = boosted_cpu_util(cpu); - req_cap = req_cap * SCHED_CAPACITY_SCALE / capacity_orig_of(cpu); - set_cfs_cpu_capacity(cpu, true, req_cap); -} -#else -#define update_capacity_of(X) do {} while(0) -#endif /* SMP and CPU_FREQ_GOV_SCHED */ - /* * The enqueue_task method is called before nr_running is * increased. Here we update the fair scheduling stats and @@ -5407,9 +5832,16 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct sched_entity *se = &p->se; #ifdef CONFIG_SMP int task_new = flags & ENQUEUE_WAKEUP_NEW; - int task_wakeup = flags & ENQUEUE_WAKEUP; #endif + /* + * If in_iowait is set, the code below may not trigger any cpufreq + * utilization updates, so do it here explicitly with the IOWAIT flag + * passed. + */ + if (p->in_iowait) + cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_IOWAIT); + for_each_sched_entity(se) { if (se->on_rq) break; @@ -5421,7 +5853,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) * * note: in the case of encountering a throttled cfs_rq we will * post the final h_nr_running increment below. - */ + */ if (cfs_rq_throttled(cfs_rq)) break; cfs_rq->h_nr_running++; @@ -5438,8 +5870,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; - update_load_avg(se, 1); - update_cfs_shares(cfs_rq); + update_load_avg(se, UPDATE_TG); + update_cfs_shares(se); } if (!se) { @@ -5474,19 +5906,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) rq->rd->overutilized = true; trace_sched_overutilized(true); } - - } - - if (!se) { - /* - * We want to potentially trigger a freq switch - * request only for tasks that are waking up; this is - * because we get here also during load balancing, but - * in these cases it seems wise to trigger as single - * request after load balancing is done. - */ - if (task_new || task_wakeup) - update_capacity_of(cpu_of(rq)); } #endif /* CONFIG_SMP */ @@ -5544,8 +5963,8 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (cfs_rq_throttled(cfs_rq)) break; - update_load_avg(se, 1); - update_cfs_shares(cfs_rq); + update_load_avg(se, UPDATE_TG); + update_cfs_shares(se); } if (!se) { @@ -5564,23 +5983,6 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) */ schedtune_dequeue_task(p, cpu_of(rq)); - if (!se) { - /* - * We want to potentially trigger a freq switch - * request only for tasks that are going to sleep; - * this is because we get here also during load - * balancing, but in these cases it seems wise to - * trigger as single request after load balancing is - * done. 
- */ - if (task_sleep) { - if (rq->cfs.nr_running) - update_capacity_of(cpu_of(rq)); - else if (sched_freq()) - set_cfs_cpu_capacity(cpu_of(rq), false, 0); - } - } - #endif /* CONFIG_SMP */ hrtick_update(rq); @@ -6000,6 +6402,7 @@ struct energy_env { int util_delta; int src_cpu; int dst_cpu; + int trg_cpu; int energy; int payoff; struct task_struct *task; @@ -6016,11 +6419,14 @@ struct energy_env { } cap; }; +static int cpu_util_wake(int cpu, struct task_struct *p); + /* * __cpu_norm_util() returns the cpu util relative to a specific capacity, - * i.e. it's busy ratio, in the range [0..SCHED_LOAD_SCALE] which is useful for - * energy calculations. Using the scale-invariant util returned by - * cpu_util() and approximating scale-invariant util by: + * i.e. its busy ratio, in the range [0..SCHED_LOAD_SCALE], which is useful for + * energy calculations. + * + * Since util is a scale-invariant utilization defined as: * * util ~ (curr_freq/max_freq)*1024 * capacity_orig/1024 * running_time/time * * the normalized util can be found using the specific capacity. * * capacity = capacity_orig * curr_freq/max_freq * * norm_util = running_time/time ~ util/capacity */ -static unsigned long __cpu_norm_util(int cpu, unsigned long capacity, int delta) +static unsigned long __cpu_norm_util(unsigned long util, unsigned long capacity) { - int util = __cpu_util(cpu, delta); - if (util >= capacity) return SCHED_CAPACITY_SCALE; return (util << SCHED_CAPACITY_SHIFT)/capacity; } -static int calc_util_delta(struct energy_env *eenv, int cpu) -{ - if (cpu == eenv->src_cpu) - return -eenv->util_delta; - if (cpu == eenv->dst_cpu) - return eenv->util_delta; - return 0; -} - -static -unsigned long group_max_util(struct energy_env *eenv) +static unsigned long group_max_util(struct energy_env *eenv) { - int i, delta; unsigned long max_util = 0; + unsigned long util; + int cpu; + + for_each_cpu(cpu, sched_group_cpus(eenv->sg_cap)) { + util = cpu_util_wake(cpu, eenv->task); + + /* + * If we are looking at the target CPU specified by the eenv, + * then we should add the (estimated) utilization of the task + * assuming we will wake it up on that CPU. + */ + if (unlikely(cpu == eenv->trg_cpu)) + util += eenv->util_delta; - for_each_cpu(i, sched_group_cpus(eenv->sg_cap)) { - delta = calc_util_delta(eenv, i); - max_util = max(max_util, __cpu_util(i, delta)); + max_util = max(max_util, util); } return max_util; @@ -6065,49 +6469,63 @@ unsigned long group_max_util(struct energy_env *eenv) /* * group_norm_util() returns the approximated group util relative to its - * current capacity (busy ratio) in the range [0..SCHED_LOAD_SCALE] for use in - * energy calculations. Since task executions may or may not overlap in time in - * the group the true normalized util is between max(cpu_norm_util(i)) and - * sum(cpu_norm_util(i)) when iterating over all cpus in the group, i. The - * latter is used as the estimate as it leads to a more pessimistic energy + * current capacity (busy ratio), in the range [0..SCHED_LOAD_SCALE], for use + * in energy calculations. + * + * Since task executions may or may not overlap in time in the group, the true + * normalized util is between MAX(cpu_norm_util(i)) and SUM(cpu_norm_util(i)) + * when iterating over all CPUs in the group. + * The latter estimate is used as it leads to a more pessimistic energy + * estimate (more busy).
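+ * For example (illustrative numbers): two CPUs with normalized utils of 0.4 + * and 0.3 give MAX = 0.4 and SUM = 0.7; the code below uses the SUM, capped + * at SCHED_CAPACITY_SCALE.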
*/ static unsigned long group_norm_util(struct energy_env *eenv, struct sched_group *sg) { - int i, delta; - unsigned long util_sum = 0; unsigned long capacity = sg->sge->cap_states[eenv->cap_idx].cap; + unsigned long util, util_sum = 0; + int cpu; - for_each_cpu(i, sched_group_cpus(sg)) { - delta = calc_util_delta(eenv, i); - util_sum += __cpu_norm_util(i, capacity, delta); + for_each_cpu(cpu, sched_group_cpus(sg)) { + util = cpu_util_wake(cpu, eenv->task); + + /* + * If we are looking at the target CPU specified by the eenv, + * then we should add the (estimated) utilization of the task + * assuming we will wake it up on that CPU. + */ + if (unlikely(cpu == eenv->trg_cpu)) + util += eenv->util_delta; + + util_sum += __cpu_norm_util(util, capacity); } - if (util_sum > SCHED_CAPACITY_SCALE) - return SCHED_CAPACITY_SCALE; - return util_sum; + return min_t(unsigned long, util_sum, SCHED_CAPACITY_SCALE); } static int find_new_capacity(struct energy_env *eenv, const struct sched_group_energy * const sge) { - int idx; + int idx, max_idx = sge->nr_cap_states - 1; unsigned long util = group_max_util(eenv); + /* default is max_cap if we don't find a match */ + eenv->cap_idx = max_idx; + for (idx = 0; idx < sge->nr_cap_states; idx++) { - if (sge->cap_states[idx].cap >= util) + if (sge->cap_states[idx].cap >= util) { + eenv->cap_idx = idx; break; + } } - eenv->cap_idx = idx; - - return idx; + return eenv->cap_idx; } -static int group_idle_state(struct sched_group *sg) +static int group_idle_state(struct energy_env *eenv, struct sched_group *sg) { int i, state = INT_MAX; + int src_in_grp, dst_in_grp; + long grp_util = 0; /* Find the shallowest idle state in the sched group. */ for_each_cpu(i, sched_group_cpus(sg)) @@ -6116,6 +6534,53 @@ static int group_idle_state(struct sched_group *sg) /* Take non-cpuidle idling into account (active idle/arch_cpu_idle()) */ state++; + src_in_grp = cpumask_test_cpu(eenv->src_cpu, sched_group_cpus(sg)); + dst_in_grp = cpumask_test_cpu(eenv->dst_cpu, sched_group_cpus(sg)); + if (src_in_grp == dst_in_grp) { + /* both CPUs under consideration are either inside the group or + * both outside it; the migration should leave the idle state + * the same. + */ + goto end; + } + + /* + * Try to estimate if a deeper idle state is + * achievable when we move the task. + */ + for_each_cpu(i, sched_group_cpus(sg)) { + grp_util += cpu_util_wake(i, eenv->task); + if (unlikely(i == eenv->trg_cpu)) + grp_util += eenv->util_delta; + } + + if (grp_util <= + ((long)sg->sgc->max_capacity * (int)sg->group_weight)) { + /* after moving, this group is at most partly + * occupied, so it should have some idle time. + */ + int max_idle_state_idx = sg->sge->nr_idle_states - 2; + int new_state = grp_util * max_idle_state_idx; + if (grp_util <= 0) + /* group will have no util, use lowest state */ + new_state = max_idle_state_idx + 1; + else { + /* for partially idle, linearly map util to idle + * states, excluding the lowest one. This does not + * correspond to the state we expect to enter in + * reality, but gives an indication of what might + * happen. + */ + new_state = min(max_idle_state_idx, (int) + (new_state / sg->sgc->max_capacity)); + new_state = max_idle_state_idx - new_state; + } + state = new_state; + } else { + /* After moving, the group will be fully occupied + * so assume it will not be idle at all.
+ */ + state = 0; + } +end: return state; } @@ -6131,39 +6596,43 @@ static int group_idle_state(struct sched_group *sg) */ static int sched_group_energy(struct energy_env *eenv) { - struct sched_domain *sd; - int cpu, total_energy = 0; struct cpumask visit_cpus; - struct sched_group *sg; + u64 total_energy = 0; + int cpu_count; WARN_ON(!eenv->sg_top->sge); cpumask_copy(&visit_cpus, sched_group_cpus(eenv->sg_top)); + /* If a cpu is hotplugged in while we are in this function, + * it does not appear in the existing visit_cpus mask + * which came from the sched_group pointer of the + * sched_domain pointed at by sd_ea for either the prev + * or next cpu and was dereferenced in __energy_diff. + * Since we will dereference sd_scs later as we iterate + * through the CPUs we expect to visit, new CPUs can + * be present which are not in the visit_cpus mask. + * Guard this with cpu_count. + */ + cpu_count = cpumask_weight(&visit_cpus); while (!cpumask_empty(&visit_cpus)) { struct sched_group *sg_shared_cap = NULL; - - cpu = cpumask_first(&visit_cpus); + int cpu = cpumask_first(&visit_cpus); + struct sched_domain *sd; /* * Is the group utilization affected by cpus outside this * sched_group? + * This sd may have groups with cpus which were not present + * when we took visit_cpus. */ sd = rcu_dereference(per_cpu(sd_scs, cpu)); - if (!sd) - /* - * We most probably raced with hotplug; returning a - * wrong energy estimation is better than entering an - * infinite loop. - */ - return -EINVAL; - - if (sd->parent) + if (sd && sd->parent) sg_shared_cap = sd->parent->groups; for_each_domain(cpu, sd) { - sg = sd->groups; + struct sched_group *sg = sd->groups; /* Has this sched_domain already been visited? */ if (sd->child && group_first_cpu(sg) != cpu) @@ -6183,43 +6652,69 @@ static int sched_group_energy(struct energy_env *eenv) if (sg->group_weight == 1) { /* Remove capacity of src CPU (before task move) */ - if (eenv->util_delta == 0 && + if (eenv->trg_cpu == eenv->src_cpu && cpumask_test_cpu(eenv->src_cpu, sched_group_cpus(sg))) { eenv->cap.before = sg->sge->cap_states[cap_idx].cap; eenv->cap.delta -= eenv->cap.before; } /* Add capacity of dst CPU (after task move) */ - if (eenv->util_delta != 0 && + if (eenv->trg_cpu == eenv->dst_cpu && cpumask_test_cpu(eenv->dst_cpu, sched_group_cpus(sg))) { eenv->cap.after = sg->sge->cap_states[cap_idx].cap; eenv->cap.delta += eenv->cap.after; } } - idle_idx = group_idle_state(sg); + idle_idx = group_idle_state(eenv, sg); group_util = group_norm_util(eenv, sg); - sg_busy_energy = (group_util * sg->sge->cap_states[cap_idx].power) - >> SCHED_CAPACITY_SHIFT; + + sg_busy_energy = (group_util * sg->sge->cap_states[cap_idx].power); sg_idle_energy = ((SCHED_LOAD_SCALE-group_util) - * sg->sge->idle_states[idle_idx].power) - >> SCHED_CAPACITY_SHIFT; + * sg->sge->idle_states[idle_idx].power); total_energy += sg_busy_energy + sg_idle_energy; - if (!sd->child) + if (!sd->child) { + /* + * cpu_count here is the number of + * cpus we expect to visit in this + * calculation. If we race against + * hotplug, we can have extra cpus + * added to the groups we are + * iterating which do not appear in + * the visit_cpus mask. In that case + * we are not able to calculate energy + * without restarting so we will bail + * out and use prev_cpu this time. 
+ */ + if (!cpu_count) + return -EINVAL; cpumask_xor(&visit_cpus, &visit_cpus, sched_group_cpus(sg)); + cpu_count--; + } if (cpumask_equal(sched_group_cpus(sg), sched_group_cpus(eenv->sg_top))) goto next_cpu; } while (sg = sg->next, sg != sd->groups); } + + /* + * If we raced with hotplug and got an sd NULL pointer, + * returning a wrong energy estimate is better than + * entering an infinite loop. + * Specifically: if a cpu is unplugged after we took + * the visit_cpus mask, it no longer has an sd_scs + * pointer, so when we dereference it, we get NULL. + */ + if (cpumask_test_cpu(cpu, &visit_cpus)) + return -EINVAL; next_cpu: cpumask_clear_cpu(cpu, &visit_cpus); continue; } - eenv->energy = total_energy; + eenv->energy = total_energy >> SCHED_CAPACITY_SHIFT; return 0; } @@ -6228,6 +6723,8 @@ static inline bool cpu_in_sg(struct sched_group *sg, int cpu) return cpu != -1 && cpumask_test_cpu(cpu, sched_group_cpus(sg)); } +static inline unsigned long task_util(struct task_struct *p); + /* * energy_diff(): Estimate the energy impact of changing the utilization * distribution. eenv specifies the change: utilization amount, source, and @@ -6240,13 +6737,16 @@ static inline int __energy_diff(struct energy_env *eenv) struct sched_domain *sd; struct sched_group *sg; int sd_cpu = -1, energy_before = 0, energy_after = 0; + int diff, margin; struct energy_env eenv_before = { - .util_delta = 0, + .util_delta = task_util(eenv->task), .src_cpu = eenv->src_cpu, .dst_cpu = eenv->dst_cpu, + .trg_cpu = eenv->src_cpu, .nrg = { 0, 0, 0, 0}, .cap = { 0, 0, 0 }, + .task = eenv->task, }; if (eenv->src_cpu == eenv->dst_cpu) @@ -6282,12 +6782,22 @@ eenv->nrg.after = energy_after; eenv->nrg.diff = eenv->nrg.after - eenv->nrg.before; eenv->payoff = 0; - +#ifndef CONFIG_SCHED_TUNE trace_sched_energy_diff(eenv->task, eenv->src_cpu, eenv->dst_cpu, eenv->util_delta, eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff, eenv->cap.before, eenv->cap.after, eenv->cap.delta, eenv->nrg.delta, eenv->payoff); +#endif + /* + * Dead-zone margin preventing too many migrations. + */ + + margin = eenv->nrg.before >> 6; /* ~1.56% */ + + diff = eenv->nrg.after - eenv->nrg.before; + + eenv->nrg.diff = (abs(diff) < margin) ? 0 : eenv->nrg.diff; return eenv->nrg.diff; } @@ -6296,29 +6806,42 @@ struct target_nrg schedtune_target_nrg; +#ifdef CONFIG_CGROUP_SCHEDTUNE +extern bool schedtune_initialized; +#endif /* CONFIG_CGROUP_SCHEDTUNE */ + /* * System energy normalization - * Returns the normalized value, in the range [0..SCHED_LOAD_SCALE], + * Returns the normalized value, in the range [0..SCHED_CAPACITY_SCALE], * corresponding to the specified energy variation. */ static inline int normalize_energy(int energy_diff) { u32 normalized_nrg; + +#ifdef CONFIG_CGROUP_SCHEDTUNE + /* during early setup, we don't know the extents */ + if (unlikely(!schedtune_initialized)) + return energy_diff < 0 ? -1 : 1 ; +#endif /* CONFIG_CGROUP_SCHEDTUNE */ + #ifdef CONFIG_SCHED_DEBUG + { int max_delta; /* Check for boundaries */ max_delta = schedtune_target_nrg.max_power; max_delta -= schedtune_target_nrg.min_power; WARN_ON(abs(energy_diff) >= max_delta); + } #endif /* Do scaling using positive numbers to increase the range */ normalized_nrg = (energy_diff < 0) ?
-energy_diff : energy_diff; /* Scale by energy magnitude */ - normalized_nrg <<= SCHED_LOAD_SHIFT; + normalized_nrg <<= SCHED_CAPACITY_SHIFT; /* Normalize on max energy for target platform */ normalized_nrg = reciprocal_divide( @@ -6337,8 +6860,14 @@ energy_diff(struct energy_env *eenv) __energy_diff(eenv); /* Return energy diff when boost margin is 0 */ - if (boost == 0) + if (boost == 0) { + trace_sched_energy_diff(eenv->task, + eenv->src_cpu, eenv->dst_cpu, eenv->util_delta, + eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff, + eenv->cap.before, eenv->cap.after, eenv->cap.delta, + 0, -eenv->nrg.diff); return eenv->nrg.diff; + } /* Compute normalized energy diff */ nrg_delta = normalize_energy(eenv->nrg.diff); @@ -6349,6 +6878,12 @@ energy_diff(struct energy_env *eenv) eenv->cap.delta, eenv->task); + trace_sched_energy_diff(eenv->task, + eenv->src_cpu, eenv->dst_cpu, eenv->util_delta, + eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff, + eenv->cap.before, eenv->cap.after, eenv->cap.delta, + eenv->nrg.delta, eenv->payoff); + /* * When SchedTune is enabled, the energy_diff() function will return * the computed energy payoff value. Since the energy_diff() return @@ -6375,31 +6910,34 @@ energy_diff(struct energy_env *eenv) * being client/server, worker/dispatcher, interrupt source or whatever is * irrelevant, spread criteria is apparent partner count exceeds socket size. */ -static int wake_wide(struct task_struct *p) +static int wake_wide(struct task_struct *p, int sibling_count_hint) { unsigned int master = current->wakee_flips; unsigned int slave = p->wakee_flips; - int factor = this_cpu_read(sd_llc_size); + int llc_size = this_cpu_read(sd_llc_size); + + if (sibling_count_hint >= llc_size) + return 1; if (master < slave) swap(master, slave); - if (slave < factor || master < slave * factor) + if (slave < llc_size || master < slave * llc_size) return 0; return 1; } -static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) +static int wake_affine(struct sched_domain *sd, struct task_struct *p, + int prev_cpu, int sync) { s64 this_load, load; s64 this_eff_load, prev_eff_load; - int idx, this_cpu, prev_cpu; + int idx, this_cpu; struct task_group *tg; unsigned long weight; int balanced; idx = sd->wake_idx; this_cpu = smp_processor_id(); - prev_cpu = task_cpu(p); load = source_load(prev_cpu, idx); this_load = target_load(this_cpu, idx); @@ -6459,8 +6997,6 @@ static inline unsigned long task_util(struct task_struct *p) return p->se.avg.util_avg; } -unsigned int capacity_margin = 1280; /* ~20% margin */ - static inline unsigned long boosted_task_util(struct task_struct *task); static inline bool __task_fits(struct task_struct *p, int cpu, int util) @@ -6486,18 +7022,20 @@ static inline bool task_fits_max(struct task_struct *p, int cpu) return __task_fits(p, cpu, 0); } -static inline bool task_fits_spare(struct task_struct *p, int cpu) +static bool __cpu_overutilized(int cpu, int delta) { - return __task_fits(p, cpu, cpu_util(cpu)); + return (capacity_of(cpu) * 1024) < ((cpu_util(cpu) + delta) * capacity_margin); } static bool cpu_overutilized(int cpu) { - return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin); + return __cpu_overutilized(cpu, 0); } #ifdef CONFIG_SCHED_TUNE +struct reciprocal_value schedtune_spc_rdiv; + static long schedtune_margin(unsigned long signal, long boost) { @@ -6508,29 +7046,16 @@ schedtune_margin(unsigned long signal, long boost) * * The Boost (B) value is used to compute a Margin (M) which is * proportional to the complement of 
the original Signal (S): - * M = B * (SCHED_LOAD_SCALE - S), if B is positive - * M = B * S, if B is negative + * M = B * (SCHED_CAPACITY_SCALE - S) * The obtained M could be used by the caller to "boost" S. */ if (boost >= 0) { - margin = SCHED_LOAD_SCALE - signal; + margin = SCHED_CAPACITY_SCALE - signal; margin *= boost; } else margin = -signal * boost; - /* - * Fast integer division by constant: - * Constant : (C) = 100 - * Precision : 0.1% (P) = 0.1 - * Reference : C * 100 / P (R) = 100000 - * - * Thus: - * Shift bits : ceil(log(R,2)) (S) = 17 - * Mult const : round(2^S/C) (M) = 1311 - * - * - */ - margin *= 1311; - margin >>= 17; + + margin = reciprocal_divide(margin, schedtune_spc_rdiv); if (boost < 0) margin *= -1; @@ -6580,10 +7105,10 @@ schedtune_task_margin(struct task_struct *task) #endif /* CONFIG_SCHED_TUNE */ -static inline unsigned long +unsigned long boosted_cpu_util(int cpu) { - unsigned long util = cpu_util(cpu); + unsigned long util = cpu_util_freq(cpu); long margin = schedtune_cpu_margin(util, cpu); trace_sched_boost_cpu(cpu, util, margin); @@ -6602,19 +7127,25 @@ boosted_task_util(struct task_struct *task) return util + margin; } +static unsigned long capacity_spare_wake(int cpu, struct task_struct *p) +{ + return capacity_orig_of(cpu) - cpu_util_wake(cpu, p); +} + /* * find_idlest_group finds and returns the least busy CPU group within the * domain. + * + * Assumes p is allowed on at least one CPU in sd. */ static struct sched_group * find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu, int sd_flag) { struct sched_group *idlest = NULL, *group = sd->groups; - struct sched_group *fit_group = NULL, *spare_group = NULL; - unsigned long min_load = ULONG_MAX, this_load = 0; - unsigned long fit_capacity = ULONG_MAX; - unsigned long max_spare_capacity = capacity_margin - SCHED_LOAD_SCALE; + struct sched_group *most_spare_sg = NULL; + unsigned long min_load = ULONG_MAX, this_load = ULONG_MAX; + unsigned long most_spare = 0, this_spare = 0; int load_idx = sd->forkexec_idx; int imbalance = 100 + (sd->imbalance_pct-100)/2; @@ -6622,7 +7153,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, load_idx = sd->wake_idx; do { - unsigned long load, avg_load, spare_capacity; + unsigned long load, avg_load, spare_cap, max_spare_cap; int local_group; int i; @@ -6634,8 +7165,12 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(group)); - /* Tally up the load of all CPUs in the group */ + /* + * Tally up the load of all CPUs in the group and find + * the group containing the CPU with most spare capacity. + */ avg_load = 0; + max_spare_cap = 0; for_each_cpu(i, sched_group_cpus(group)) { /* Bias balancing toward cpus of our domain */ @@ -6646,24 +7181,10 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, avg_load += load; - /* - * Look for most energy-efficient group that can fit - * that can fit the task. - */ - if (capacity_of(i) < fit_capacity && task_fits_spare(p, i)) { - fit_capacity = capacity_of(i); - fit_group = group; - } + spare_cap = capacity_spare_wake(i, p); - /* - * Look for group which has most spare capacity on a - * single cpu. 
- */ - spare_capacity = capacity_of(i) - cpu_util(i); - if (spare_capacity > max_spare_capacity) { - max_spare_capacity = spare_capacity; - spare_group = group; - } + if (spare_cap > max_spare_cap) + max_spare_cap = spare_cap; } /* Adjust by relative CPU capacity of the group */ @@ -6671,28 +7192,51 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, if (local_group) { this_load = avg_load; - } else if (avg_load < min_load) { - min_load = avg_load; - idlest = group; + this_spare = max_spare_cap; + } else { + if (avg_load < min_load) { + min_load = avg_load; + idlest = group; + } + + if (most_spare < max_spare_cap) { + most_spare = max_spare_cap; + most_spare_sg = group; + } } } while (group = group->next, group != sd->groups); - if (fit_group) - return fit_group; + /* + * The cross-over point between using spare capacity or least load + * is too conservative for high utilization tasks on partially + * utilized systems if we require spare_capacity > task_util(p), + * so we allow for some task stuffing by using + * spare_capacity > task_util(p)/2. + * + * Spare capacity can't be used for fork because the utilization has + * not been set yet, we must first select a rq to compute the initial + * utilization. + */ + if (sd_flag & SD_BALANCE_FORK) + goto skip_spare; - if (spare_group) - return spare_group; + if (this_spare > task_util(p) / 2 && + imbalance*this_spare > 100*most_spare) + return NULL; + else if (most_spare > task_util(p) / 2) + return most_spare_sg; +skip_spare: if (!idlest || 100*this_load < imbalance*min_load) return NULL; return idlest; } /* - * find_idlest_cpu - find the idlest cpu among the cpus in group. + * find_idlest_group_cpu - find the idlest cpu among the cpus in group. */ static int -find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) +find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) { unsigned long load, min_load = ULONG_MAX; unsigned int min_exit_latency = UINT_MAX; @@ -6701,9 +7245,13 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) int shallowest_idle_cpu = -1; int i; + /* Check if we have any choice: */ + if (group->group_weight == 1) + return cpumask_first(sched_group_cpus(group)); + /* Traverse only the allowed CPUs */ for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) { - if (task_fits_spare(p, i)) { + if (idle_cpu(i)) { struct rq *rq = cpu_rq(i); struct cpuidle_state *idle = idle_get_state(rq); if (idle && idle->exit_latency < min_exit_latency) { @@ -6715,8 +7263,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) min_exit_latency = idle->exit_latency; latest_idle_timestamp = rq->idle_stamp; shallowest_idle_cpu = i; - } else if (idle_cpu(i) && - (!idle || idle->exit_latency == min_exit_latency) && + } else if ((!idle || idle->exit_latency == min_exit_latency) && rq->idle_stamp > latest_idle_timestamp) { /* * If equal or no active idle state, then @@ -6725,13 +7272,6 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) */ latest_idle_timestamp = rq->idle_stamp; shallowest_idle_cpu = i; - } else if (shallowest_idle_cpu == -1) { - /* - * If we haven't found an idle CPU yet - * pick a non-idle one that can fit the task as - * fallback. 
- */ - shallowest_idle_cpu = i; } } else if (shallowest_idle_cpu == -1) { load = weighted_cpuload(i); @@ -6743,29 +7283,99 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) } return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu; + } + +static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p, + int cpu, int prev_cpu, int sd_flag) +{ + int new_cpu = cpu; + int wu = sd_flag & SD_BALANCE_WAKE; + int cas_cpu = -1; + + if (wu) { + schedstat_inc(p, se.statistics.nr_wakeups_cas_attempts); + schedstat_inc(this_rq(), eas_stats.cas_attempts); + } + + if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed)) + return prev_cpu; + + while (sd) { + struct sched_group *group; + struct sched_domain *tmp; + int weight; + + if (wu) + schedstat_inc(sd, eas_stats.cas_attempts); + + if (!(sd->flags & sd_flag)) { + sd = sd->child; + continue; + } + + group = find_idlest_group(sd, p, cpu, sd_flag); + if (!group) { + sd = sd->child; + continue; + } + + new_cpu = find_idlest_group_cpu(group, p, cpu); + if (new_cpu == cpu) { + /* Now try balancing at a lower domain level of cpu */ + sd = sd->child; + continue; + } + + /* Now try balancing at a lower domain level of new_cpu */ + cpu = cas_cpu = new_cpu; + weight = sd->span_weight; + sd = NULL; + for_each_domain(cpu, tmp) { + if (weight <= tmp->span_weight) + break; + if (tmp->flags & sd_flag) + sd = tmp; + } + /* while loop will break here if sd == NULL */ + } + + if (wu && (cas_cpu >= 0)) { + schedstat_inc(p, se.statistics.nr_wakeups_cas_count); + schedstat_inc(this_rq(), eas_stats.cas_count); + } + + return new_cpu; } /* * Try and locate an idle CPU in the sched_domain. */ -static int select_idle_sibling(struct task_struct *p, int target) +static int select_idle_sibling(struct task_struct *p, int prev, int target) { struct sched_domain *sd; struct sched_group *sg; - int i = task_cpu(p); - int best_idle = -1; - int best_idle_cstate = -1; - int best_idle_capacity = INT_MAX; + int best_idle_cpu = -1; + int best_idle_cstate = INT_MAX; + unsigned long best_idle_capacity = ULONG_MAX; + + schedstat_inc(p, se.statistics.nr_wakeups_sis_attempts); + schedstat_inc(this_rq(), eas_stats.sis_attempts); if (!sysctl_sched_cstate_aware) { - if (idle_cpu(target)) + if (idle_cpu(target)) { + schedstat_inc(p, se.statistics.nr_wakeups_sis_idle); + schedstat_inc(this_rq(), eas_stats.sis_idle); return target; + } /* * If the prevous cpu is cache affine and idle, don't be stupid. 
*/ - if (i != target && cpus_share_cache(i, target) && idle_cpu(i)) - return i; + if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev)) { + schedstat_inc(p, se.statistics.nr_wakeups_sis_cache_affine); + schedstat_inc(this_rq(), eas_stats.sis_cache_affine); + return prev; + } } if (!(current->flags & PF_WAKE_UP_IDLE) && @@ -6779,24 +7389,30 @@ static int select_idle_sibling(struct task_struct *p, int target) for_each_lower_domain(sd) { sg = sd->groups; do { + int i; if (!cpumask_intersects(sched_group_cpus(sg), tsk_cpus_allowed(p))) goto next; if (sysctl_sched_cstate_aware) { for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) { - struct rq *rq = cpu_rq(i); - int idle_idx = idle_get_state_idx(rq); + int idle_idx = idle_get_state_idx(cpu_rq(i)); unsigned long new_usage = boosted_task_util(p); unsigned long capacity_orig = capacity_orig_of(i); + if (new_usage > capacity_orig || !idle_cpu(i)) goto next; - if (i == target && new_usage <= capacity_curr_of(target)) + if (i == target && new_usage <= capacity_curr_of(target)) { + schedstat_inc(p, se.statistics.nr_wakeups_sis_suff_cap); + schedstat_inc(this_rq(), eas_stats.sis_suff_cap); + schedstat_inc(sd, eas_stats.sis_suff_cap); return target; + } - if (best_idle < 0 || (idle_idx < best_idle_cstate && capacity_orig <= best_idle_capacity)) { - best_idle = i; + if (idle_idx < best_idle_cstate && + capacity_orig <= best_idle_capacity) { + best_idle_cpu = i; best_idle_cstate = idle_idx; best_idle_capacity = capacity_orig; } @@ -6809,231 +7425,462 @@ static int select_idle_sibling(struct task_struct *p, int target) target = cpumask_first_and(sched_group_cpus(sg), tsk_cpus_allowed(p)); + schedstat_inc(p, se.statistics.nr_wakeups_sis_idle_cpu); + schedstat_inc(this_rq(), eas_stats.sis_idle_cpu); + schedstat_inc(sd, eas_stats.sis_idle_cpu); goto done; } next: sg = sg->next; } while (sg != sd->groups); } - if (best_idle > 0) - target = best_idle; + + if (best_idle_cpu >= 0) + target = best_idle_cpu; done: + schedstat_inc(p, se.statistics.nr_wakeups_sis_count); + schedstat_inc(this_rq(), eas_stats.sis_count); + return target; } -static inline int find_best_target(struct task_struct *p, bool boosted, bool prefer_idle) +/* + * cpu_util_wake: Compute cpu utilization with any contributions from + * the waking task p removed. check_for_migration() looks for a better CPU of + * rq->curr. For that case we should return cpu util with contributions from + * currently running task p removed. + */ +static int cpu_util_wake(int cpu, struct task_struct *p) { - int iter_cpu; - int target_cpu = -1; - int target_util = 0; - int backup_capacity = 0; - int best_idle_cpu = -1; - int best_idle_cstate = INT_MAX; - int backup_cpu = -1; - unsigned long task_util_boosted, new_util; - - task_util_boosted = boosted_task_util(p); - for (iter_cpu = 0; iter_cpu < NR_CPUS; iter_cpu++) { - int cur_capacity; - struct rq *rq; - int idle_idx; - - /* - * Iterate from higher cpus for boosted tasks. - */ - int i = boosted ? NR_CPUS-iter_cpu-1 : iter_cpu; - - if (!cpu_online(i) || !cpumask_test_cpu(i, tsk_cpus_allowed(p))) - continue; + unsigned long util, capacity; - /* - * p's blocked utilization is still accounted for on prev_cpu - * so prev_cpu will receive a negative bias due to the double - * accounting. However, the blocked utilization may be zero. - */ - new_util = cpu_util(i) + task_util_boosted; - - /* - * Ensure minimum capacity to grant the required boost. 
- * The target CPU can be already at a capacity level higher - * than the one required to boost the task. - */ - if (new_util > capacity_orig_of(i)) - continue; - - /* - * Unconditionally favoring tasks that prefer idle cpus to - * improve latency. - */ - if (idle_cpu(i) && prefer_idle) { - if (best_idle_cpu < 0) - best_idle_cpu = i; - continue; - } +#ifdef CONFIG_SCHED_WALT + /* + * WALT does not decay idle tasks in the same manner + * as PELT, so it makes little sense to subtract task + * utilization from cpu utilization. Instead just use + * cpu_util for this case. + */ + if (!walt_disabled && sysctl_sched_use_walt_cpu_util && + p->state == TASK_WAKING) + return cpu_util(cpu); +#endif + /* Task has no contribution or is new */ + if (cpu != task_cpu(p) || !p->se.avg.last_update_time) + return cpu_util(cpu); - cur_capacity = capacity_curr_of(i); - rq = cpu_rq(i); - idle_idx = idle_get_state_idx(rq); + capacity = capacity_orig_of(cpu); + util = max_t(long, cpu_util(cpu) - task_util(p), 0); - if (new_util < cur_capacity) { - if (cpu_rq(i)->nr_running) { - if (prefer_idle) { - /* Find a target cpu with highest - * utilization. - */ - if (target_util == 0 || - target_util < new_util) { - target_cpu = i; - target_util = new_util; - } - } else { - /* Find a target cpu with lowest - * utilization. - */ - if (target_util == 0 || - target_util > new_util) { - target_cpu = i; - target_util = new_util; - } - } - } else if (!prefer_idle) { - if (best_idle_cpu < 0 || - (sysctl_sched_cstate_aware && - best_idle_cstate > idle_idx)) { - best_idle_cstate = idle_idx; - best_idle_cpu = i; - } - } - } else if (backup_capacity == 0 || - backup_capacity > cur_capacity) { - // Find a backup cpu with least capacity. - backup_capacity = cur_capacity; - backup_cpu = i; - } - } + return (util >= capacity) ? capacity : util; +} - if (prefer_idle && best_idle_cpu >= 0) - target_cpu = best_idle_cpu; - else if (target_cpu < 0) - target_cpu = best_idle_cpu >= 0 ? best_idle_cpu : backup_cpu; +static int start_cpu(bool boosted) +{ + struct root_domain *rd = cpu_rq(smp_processor_id())->rd; - return target_cpu; + return boosted ? 
rd->max_cap_orig_cpu : rd->min_cap_orig_cpu; } -static int energy_aware_wake_cpu(struct task_struct *p, int target, int sync) +static inline int find_best_target(struct task_struct *p, int *backup_cpu, + bool boosted, bool prefer_idle) { + unsigned long best_idle_min_cap_orig = ULONG_MAX; + unsigned long min_util = boosted_task_util(p); + unsigned long target_capacity = ULONG_MAX; + unsigned long min_wake_util = ULONG_MAX; + unsigned long target_max_spare_cap = 0; + unsigned long best_active_util = ULONG_MAX; + int best_idle_cstate = INT_MAX; struct sched_domain *sd; - struct sched_group *sg, *sg_target; - int target_max_cap = INT_MAX; - int target_cpu = task_cpu(p); - unsigned long task_util_boosted, new_util; - int i; + struct sched_group *sg; + int best_active_cpu = -1; + int best_idle_cpu = -1; + int target_cpu = -1; + int cpu, i; - if (sysctl_sched_sync_hint_enable && sync) { - int cpu = smp_processor_id(); - cpumask_t search_cpus; - cpumask_and(&search_cpus, tsk_cpus_allowed(p), cpu_online_mask); - if (cpumask_test_cpu(cpu, &search_cpus)) - return cpu; - } + *backup_cpu = -1; - sd = rcu_dereference(per_cpu(sd_ea, task_cpu(p))); + schedstat_inc(p, se.statistics.nr_wakeups_fbt_attempts); + schedstat_inc(this_rq(), eas_stats.fbt_attempts); - if (!sd) - return target; + /* Find start CPU based on boost value */ + cpu = start_cpu(boosted); + if (cpu < 0) { + schedstat_inc(p, se.statistics.nr_wakeups_fbt_no_cpu); + schedstat_inc(this_rq(), eas_stats.fbt_no_cpu); + return -1; + } - sg = sd->groups; - sg_target = sg; + /* Find SD for the start CPU */ + sd = rcu_dereference(per_cpu(sd_ea, cpu)); + if (!sd) { + schedstat_inc(p, se.statistics.nr_wakeups_fbt_no_sd); + schedstat_inc(this_rq(), eas_stats.fbt_no_sd); + return -1; + } - if (sysctl_sched_is_big_little) { + /* Scan CPUs in all SDs */ + sg = sd->groups; + do { + for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) { + unsigned long capacity_curr = capacity_curr_of(i); + unsigned long capacity_orig = capacity_orig_of(i); + unsigned long wake_util, new_util; - /* - * Find group with sufficient capacity. We only get here if no cpu is - * overutilized. We may end up overutilizing a cpu by adding the task, - * but that should not be any worse than select_idle_sibling(). - * load_balance() should sort it out later as we get above the tipping - * point. - */ - do { - /* Assuming all cpus are the same in group */ - int max_cap_cpu = group_first_cpu(sg); + if (!cpu_online(i)) + continue; - /* - * Assume smaller max capacity means more energy-efficient. - * Ideally we should query the energy model for the right - * answer but it easily ends up in an exhaustive search. - */ - if (capacity_of(max_cap_cpu) < target_max_cap && - task_fits_max(p, max_cap_cpu)) { - sg_target = sg; - target_max_cap = capacity_of(max_cap_cpu); - } - } while (sg = sg->next, sg != sd->groups); + if (walt_cpu_high_irqload(i)) + continue; - task_util_boosted = boosted_task_util(p); - /* Find cpu with sufficient capacity */ - for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg_target)) { /* * p's blocked utilization is still accounted for on prev_cpu * so prev_cpu will receive a negative bias due to the double * accounting. However, the blocked utilization may be zero. */ - new_util = cpu_util(i) + task_util_boosted; + wake_util = cpu_util_wake(i, p); + new_util = wake_util + task_util(p); /* * Ensure minimum capacity to grant the required boost. * The target CPU can be already at a capacity level higher * than the one required to boost the task. 
*/ - if (new_util > capacity_orig_of(i)) + new_util = max(min_util, new_util); + if (new_util > capacity_orig) continue; - if (new_util < capacity_curr_of(i)) { - target_cpu = i; - if (cpu_rq(i)->nr_running) - break; + /* + * Case A) Latency sensitive tasks + * + * Unconditionally favoring tasks that prefer idle CPU to + * improve latency. + * + * Looking for: + * - an idle CPU, whatever its idle_state is, since + * the first CPUs we explore are more likely to be + * reserved for latency sensitive tasks. + * - a non idle CPU where the task fits in its current + * capacity and has the maximum spare capacity. + * - a non idle CPU with lower contention from other + * tasks and running at the lowest possible OPP. + * + * The last two goals tries to favor a non idle CPU + * where the task can run as if it is "almost alone". + * A maximum spare capacity CPU is favoured since + * the task already fits into that CPU's capacity + * without waiting for an OPP chance. + * + * The following code path is the only one in the CPUs + * exploration loop which is always used by + * prefer_idle tasks. It exits the loop with wither a + * best_active_cpu or a target_cpu which should + * represent an optimal choice for latency sensitive + * tasks. + */ + if (prefer_idle) { + + /* + * Case A.1: IDLE CPU + * Return the first IDLE CPU we find. + */ + if (idle_cpu(i)) { + schedstat_inc(p, se.statistics.nr_wakeups_fbt_pref_idle); + schedstat_inc(this_rq(), eas_stats.fbt_pref_idle); + + trace_sched_find_best_target(p, + prefer_idle, min_util, + cpu, best_idle_cpu, + best_active_cpu, i); + + return i; + } + + /* + * Case A.2: Target ACTIVE CPU + * Favor CPUs with max spare capacity. + */ + if ((capacity_curr > new_util) && + (capacity_orig - new_util > target_max_spare_cap)) { + target_max_spare_cap = capacity_orig - new_util; + target_cpu = i; + continue; + } + if (target_cpu != -1) + continue; + + + /* + * Case A.3: Backup ACTIVE CPU + * Favor CPUs with: + * - lower utilization due to other tasks + * - lower utilization with the task in + */ + if (wake_util > min_wake_util) + continue; + if (new_util > best_active_util) + continue; + min_wake_util = wake_util; + best_active_util = new_util; + best_active_cpu = i; + continue; } - /* cpu has capacity at higher OPP, keep it as fallback */ - if (target_cpu == task_cpu(p)) - target_cpu = i; + /* + * Enforce EAS mode + * + * For non latency sensitive tasks, skip CPUs that + * will be overutilized by moving the task there. + * + * The goal here is to remain in EAS mode as long as + * possible at least for !prefer_idle tasks. + */ + if ((new_util * capacity_margin) > + (capacity_orig * SCHED_CAPACITY_SCALE)) + continue; + + /* + * Case B) Non latency sensitive tasks on IDLE CPUs. + * + * Find an optimal backup IDLE CPU for non latency + * sensitive tasks. + * + * Looking for: + * - minimizing the capacity_orig, + * i.e. preferring LITTLE CPUs + * - favoring shallowest idle states + * i.e. avoid to wakeup deep-idle CPUs + * + * The following code path is used by non latency + * sensitive tasks if IDLE CPUs are available. If at + * least one of such CPUs are available it sets the + * best_idle_cpu to the most suitable idle CPU to be + * selected. + * + * If idle CPUs are available, favour these CPUs to + * improve performances by spreading tasks. + * Indeed, the energy_diff() computed by the caller + * will take care to ensure the minimization of energy + * consumptions without affecting performance. 
+ */ + if (idle_cpu(i)) { + int idle_idx = idle_get_state_idx(cpu_rq(i)); + + /* Select idle CPU with lower cap_orig */ + if (capacity_orig > best_idle_min_cap_orig) + continue; + + /* + * Skip CPUs in deeper idle state, but only + * if they are also less energy efficient. + * IOW, prefer a deep IDLE LITTLE CPU vs a + * shallow idle big CPU. + */ + if (sysctl_sched_cstate_aware && + best_idle_cstate <= idle_idx) + continue; + + /* Keep track of best idle CPU */ + best_idle_min_cap_orig = capacity_orig; + best_idle_cstate = idle_idx; + best_idle_cpu = i; + continue; + } + + /* + * Case C) Non latency sensitive tasks on ACTIVE CPUs. + * + * Pack tasks in the most energy efficient capacities. + * + * This task packing strategy prefers more energy + * efficient CPUs (i.e. pack on smaller maximum + * capacity CPUs) while also trying to spread tasks to + * run them all at the lower OPP. + * + * This assumes for example that it's more energy + * efficient to run two tasks on two CPUs at a lower + * OPP than packing both on a single CPU but running + * that CPU at an higher OPP. + * + * Thus, this case keep track of the CPU with the + * smallest maximum capacity and highest spare maximum + * capacity. + */ + + /* Favor CPUs with smaller capacity */ + if (capacity_orig > target_capacity) + continue; + + /* Favor CPUs with maximum spare capacity */ + if ((capacity_orig - new_util) < target_max_spare_cap) + continue; + + target_max_spare_cap = capacity_orig - new_util; + target_capacity = capacity_orig; + target_cpu = i; } - } else { - /* - * Find a cpu with sufficient capacity - */ + + } while (sg = sg->next, sg != sd->groups); + + /* + * For non latency sensitive tasks, cases B and C in the previous loop, + * we pick the best IDLE CPU only if we was not able to find a target + * ACTIVE CPU. + * + * Policies priorities: + * + * - prefer_idle tasks: + * + * a) IDLE CPU available, we return immediately + * b) ACTIVE CPU where task fits and has the bigger maximum spare + * capacity (i.e. target_cpu) + * c) ACTIVE CPU with less contention due to other tasks + * (i.e. best_active_cpu) + * + * - NON prefer_idle tasks: + * + * a) ACTIVE CPU: target_cpu + * b) IDLE CPU: best_idle_cpu + */ + if (target_cpu == -1) + target_cpu = prefer_idle + ? best_active_cpu + : best_idle_cpu; + else + *backup_cpu = prefer_idle + ? best_active_cpu + : best_idle_cpu; + + trace_sched_find_best_target(p, prefer_idle, min_util, cpu, + best_idle_cpu, best_active_cpu, + target_cpu); + + schedstat_inc(p, se.statistics.nr_wakeups_fbt_count); + schedstat_inc(this_rq(), eas_stats.fbt_count); + + return target_cpu; +} + +/* + * Disable WAKE_AFFINE in the case where task @p doesn't fit in the + * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu. + * + * In that case WAKE_AFFINE doesn't make sense and we'll let + * BALANCE_WAKE sort things out. 
+ */ +static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) +{ + long min_cap, max_cap; + + min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu)); + max_cap = cpu_rq(cpu)->rd->max_cpu_capacity.val; + + /* Minimum capacity is close to max, no need to abort wake_affine */ + if (max_cap - min_cap < max_cap >> 3) + return 0; + + /* Bring task utilization in sync with prev_cpu */ + sync_entity_load_avg(&p->se); + + return min_cap * 1024 < task_util(p) * capacity_margin; +} + +static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync) +{ + struct sched_domain *sd; + int target_cpu = prev_cpu, tmp_target, tmp_backup; + bool boosted, prefer_idle; + + schedstat_inc(p, se.statistics.nr_wakeups_secb_attempts); + schedstat_inc(this_rq(), eas_stats.secb_attempts); + + if (sysctl_sched_sync_hint_enable && sync) { + int cpu = smp_processor_id(); + + if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { + schedstat_inc(p, se.statistics.nr_wakeups_secb_sync); + schedstat_inc(this_rq(), eas_stats.secb_sync); + return cpu; + } + } + + rcu_read_lock(); #ifdef CONFIG_CGROUP_SCHEDTUNE - bool boosted = schedtune_task_boost(p) > 0; - bool prefer_idle = schedtune_prefer_idle(p) > 0; + boosted = schedtune_task_boost(p) > 0; + prefer_idle = schedtune_prefer_idle(p) > 0; #else - bool boosted = 0; - bool prefer_idle = 0; + boosted = get_sysctl_sched_cfs_boost() > 0; + prefer_idle = 0; #endif - int tmp_target = find_best_target(p, boosted, prefer_idle); - if (tmp_target >= 0) { - target_cpu = tmp_target; - if ((boosted || prefer_idle) && idle_cpu(target_cpu)) - return target_cpu; + + sync_entity_load_avg(&p->se); + + sd = rcu_dereference(per_cpu(sd_ea, prev_cpu)); + /* Find a cpu with sufficient capacity */ + tmp_target = find_best_target(p, &tmp_backup, boosted, prefer_idle); + + if (!sd) + goto unlock; + if (tmp_target >= 0) { + target_cpu = tmp_target; + if ((boosted || prefer_idle) && idle_cpu(target_cpu)) { + schedstat_inc(p, se.statistics.nr_wakeups_secb_idle_bt); + schedstat_inc(this_rq(), eas_stats.secb_idle_bt); + goto unlock; } } - if (target_cpu != task_cpu(p)) { + if (target_cpu != prev_cpu) { + int delta = 0; struct energy_env eenv = { - .util_delta = task_util(p), - .src_cpu = task_cpu(p), - .dst_cpu = target_cpu, - .task = p, + .util_delta = task_util(p), + .src_cpu = prev_cpu, + .dst_cpu = target_cpu, + .task = p, + .trg_cpu = target_cpu, }; + +#ifdef CONFIG_SCHED_WALT + if (!walt_disabled && sysctl_sched_use_walt_cpu_util && + p->state == TASK_WAKING) + delta = task_util(p); +#endif /* Not enough spare capacity on previous cpu */ - if (cpu_overutilized(task_cpu(p))) - return target_cpu; + if (__cpu_overutilized(prev_cpu, delta)) { + schedstat_inc(p, se.statistics.nr_wakeups_secb_insuff_cap); + schedstat_inc(this_rq(), eas_stats.secb_insuff_cap); + goto unlock; + } + + if (energy_diff(&eenv) >= 0) { + /* No energy saving for target_cpu, try backup */ + target_cpu = tmp_backup; + eenv.dst_cpu = target_cpu; + eenv.trg_cpu = target_cpu; + if (tmp_backup < 0 || + tmp_backup == prev_cpu || + energy_diff(&eenv) >= 0) { + schedstat_inc(p, se.statistics.nr_wakeups_secb_no_nrg_sav); + schedstat_inc(this_rq(), eas_stats.secb_no_nrg_sav); + target_cpu = prev_cpu; + goto unlock; + } + } - if (energy_diff(&eenv) >= 0) - return task_cpu(p); + schedstat_inc(p, se.statistics.nr_wakeups_secb_nrg_sav); + schedstat_inc(this_rq(), eas_stats.secb_nrg_sav); + goto unlock; } + schedstat_inc(p, se.statistics.nr_wakeups_secb_count); + schedstat_inc(this_rq(), eas_stats.secb_count); + 
+unlock: + rcu_read_unlock(); + return target_cpu; } @@ -7050,7 +7897,8 @@ static int energy_aware_wake_cpu(struct task_struct *p, int target, int sync) * preempt must be disabled. */ static int -select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags) +select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags, + int sibling_count_hint) { struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; int cpu = smp_processor_id(); @@ -7062,10 +7910,15 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f return select_best_cpu(p, prev_cpu, 0, sync); #endif - if (sd_flag & SD_BALANCE_WAKE) - want_affine = (!wake_wide(p) && task_fits_max(p, cpu) && - cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) || - energy_aware(); + if (sd_flag & SD_BALANCE_WAKE) { + record_wakee(p); + want_affine = !wake_wide(p, sibling_count_hint) && + !wake_cap(p, cpu, prev_cpu) && + cpumask_test_cpu(cpu, &p->cpus_allowed); + } + + if (energy_aware() && !(cpu_rq(prev_cpu)->rd->overutilized)) + return select_energy_cpu_brute(p, prev_cpu, sync); rcu_read_lock(); for_each_domain(cpu, tmp) { @@ -7090,49 +7943,25 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f if (affine_sd) { sd = NULL; /* Prefer wake_affine over balance flags */ - if (cpu != prev_cpu && wake_affine(affine_sd, p, sync)) + if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync)) new_cpu = cpu; } - if (!sd) { - if (energy_aware() && !cpu_rq(cpu)->rd->overutilized) - new_cpu = energy_aware_wake_cpu(p, prev_cpu, sync); - else if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ - new_cpu = select_idle_sibling(p, new_cpu); - - } else while (sd) { - struct sched_group *group; - int weight; - - if (!(sd->flags & sd_flag)) { - sd = sd->child; - continue; - } - - group = find_idlest_group(sd, p, cpu, sd_flag); - if (!group) { - sd = sd->child; - continue; - } + if (sd && !(sd_flag & SD_BALANCE_FORK)) { + /* + * We're going to need the task's util for capacity_spare_wake + * in find_idlest_group. Sync it up to prev_cpu's + * last_update_time. + */ + sync_entity_load_avg(&p->se); + } - new_cpu = find_idlest_cpu(group, p, cpu); - if (new_cpu == -1 || new_cpu == cpu) { - /* Now try balancing at a lower domain level of cpu */ - sd = sd->child; - continue; - } + if (!sd) { + if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ + new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); - /* Now try balancing at a lower domain level of new_cpu */ - cpu = new_cpu; - weight = sd->span_weight; - sd = NULL; - for_each_domain(cpu, tmp) { - if (weight <= tmp->span_weight) - break; - if (tmp->flags & sd_flag) - sd = tmp; - } - /* while loop will break here if sd == NULL */ + } else { + new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag); } rcu_read_unlock(); @@ -8083,10 +8912,6 @@ static void attach_one_task(struct rq *rq, struct task_struct *p) { raw_spin_lock(&rq->lock); attach_task(rq, p); - /* - * We want to potentially raise target_cpu's OPP. - */ - update_capacity_of(cpu_of(rq)); raw_spin_unlock(&rq->lock); } @@ -8108,11 +8933,6 @@ static void attach_tasks(struct lb_env *env) attach_task(env->dst_rq, p); } - /* - * We want to potentially raise env.dst_cpu's OPP. 
- */ - update_capacity_of(env->dst_cpu); - raw_spin_unlock(&env->dst_rq->lock); } @@ -8135,8 +8955,13 @@ static void update_blocked_averages(int cpu) if (throttled_hierarchy(cfs_rq)) continue; - if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq)) + if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, + true)) update_tg_load_avg(cfs_rq, 0); + + /* Propagate pending load changes to the parent */ + if (cfs_rq->tg->se[cpu]) + update_load_avg(cfs_rq->tg->se[cpu], 0); } raw_spin_unlock_irqrestore(&rq->lock, flags); } @@ -8196,7 +9021,7 @@ static inline void update_blocked_averages(int cpu) raw_spin_lock_irqsave(&rq->lock, flags); update_rq_clock(rq); - update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq); + update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true); raw_spin_unlock_irqrestore(&rq->lock, flags); } @@ -8434,13 +9259,14 @@ skip_unlock: __attribute__ ((unused)); cpu_rq(cpu)->cpu_capacity = capacity; sdg->sgc->capacity = capacity; sdg->sgc->max_capacity = capacity; + sdg->sgc->min_capacity = capacity; } void update_group_capacity(struct sched_domain *sd, int cpu) { struct sched_domain *child = sd->child; struct sched_group *group, *sdg = sd->groups; - unsigned long capacity, max_capacity; + unsigned long capacity, max_capacity, min_capacity; unsigned long interval; interval = msecs_to_jiffies(sd->balance_interval); @@ -8454,6 +9280,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) capacity = 0; max_capacity = 0; + min_capacity = ULONG_MAX; if (child->flags & SD_OVERLAP) { /* @@ -8486,6 +9313,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) } max_capacity = max(capacity, max_capacity); + min_capacity = min(capacity, min_capacity); } } else { /* @@ -8503,6 +9331,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) if (!cpu_isolated(cpumask_first(cpus))) { capacity += sgc->capacity; max_capacity = max(sgc->max_capacity, max_capacity); + min_capacity = min(sgc->min_capacity, min_capacity); } group = group->next; } while (group != child->groups); @@ -8510,6 +9339,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) sdg->sgc->capacity = capacity; sdg->sgc->max_capacity = max_capacity; + sdg->sgc->min_capacity = min_capacity; } /* @@ -8632,6 +9462,38 @@ group_type group_classify(struct sched_group *group, return group_other; } +#ifdef CONFIG_NO_HZ_COMMON +/* + * idle load balancing data + * - used by the nohz balance, but we want it available here + * so that we can see which CPUs have no tick. + */ +static struct { + cpumask_var_t idle_cpus_mask; + atomic_t nr_cpus; + unsigned long next_balance; /* in jiffy units */ +} nohz ____cacheline_aligned; + +static inline void update_cpu_stats_if_tickless(struct rq *rq) +{ + /* only called from update_sg_lb_stats when irqs are disabled */ + if (cpumask_test_cpu(rq->cpu, nohz.idle_cpus_mask)) { + /* rate limit updates to once-per-jiffie at most */ + if (READ_ONCE(jiffies) <= rq->last_load_update_tick) + return; + + raw_spin_lock(&rq->lock); + update_rq_clock(rq); + update_idle_cpu_load(rq); + update_cfs_rq_load_avg(rq->clock_task, &rq->cfs, false); + raw_spin_unlock(&rq->lock); + } +} + +#else +static inline void update_cpu_stats_if_tickless(struct rq *rq) { } +#endif + /** * update_sg_lb_stats - Update sched_group's statistics for load balancing. * @env: The load balancing environment. 
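The EAS wake-up paths earlier in this file (__cpu_overutilized(), wake_cap() and the "Enforce EAS mode" test in find_best_target()) all apply the same headroom rule: the utilization scaled by capacity_margin must stay below the CPU capacity scaled by SCHED_CAPACITY_SCALE. The stand-alone sketch of that integer math below is an illustration only, not part of the patch; it assumes the default capacity_margin of 1280 (~20%) whose definition this patch removes from fair.c and which is presumably kept elsewhere in the scheduler.

/*
 * Illustrative user-space sketch, not kernel code. The constants mirror
 * the values visible in this diff (SCHED_CAPACITY_SCALE == 1024,
 * capacity_margin == 1280); the helper names are invented.
 */
#include <stdio.h>

#define SCHED_CAPACITY_SCALE    1024
#define CAPACITY_MARGIN         1280    /* 1280/1024 = 1.25 -> ~20% headroom */

/* Mirrors __cpu_overutilized(): capacity * 1024 < (util + delta) * margin */
static int overutilized(unsigned long capacity, unsigned long util)
{
        return capacity * SCHED_CAPACITY_SCALE < util * CAPACITY_MARGIN;
}

/* Mirrors the wake_cap() fit test: min_cap * 1024 < task_util * margin */
static int task_too_big(unsigned long min_cap, unsigned long task_util)
{
        return min_cap * SCHED_CAPACITY_SCALE < task_util * CAPACITY_MARGIN;
}

int main(void)
{
        /* A CPU of capacity 1024 becomes overutilized once util exceeds ~819 (80%). */
        printf("util 800 on cap 1024 overutilized: %d\n", overutilized(1024, 800));
        printf("util 820 on cap 1024 overutilized: %d\n", overutilized(1024, 820));

        /* wake_cap(): a task with util 400 is too big for a 430-capacity LITTLE CPU. */
        printf("util 400 fits cap 430: %d\n", !task_too_big(430, 400));
        return 0;
}

With a capacity of 1024 the cut-off is 1024 * 1024 / 1280 = 819, i.e. roughly 80% of the CPU, which is where the "~20% margin" comment on capacity_margin comes from.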
@@ -8663,6 +9525,12 @@ static inline void update_sg_lb_stats(struct lb_env *env, if (cpu_isolated(i)) continue; + /* if we are entering idle and there are CPUs with + * their tick stopped, do an update for them + */ + if (env->idle == CPU_NEWLY_IDLE) + update_cpu_stats_if_tickless(rq); + /* Bias balancing toward cpus of our domain */ if (local_group) load = target_load(i, load_idx); @@ -8791,15 +9659,21 @@ static bool update_sd_pick_busiest(struct lb_env *env, if (sgs->avg_load <= busiest->avg_load) return false; + if (!(env->sd->flags & SD_ASYM_CPUCAPACITY)) + goto asym_packing; + /* - * Candiate sg has no more than one task per cpu and has higher - * per-cpu capacity. No reason to pull tasks to less capable cpus. + * Candidate sg has no more than one task per CPU and + * has higher per-CPU capacity. Migrating tasks to less + * capable CPUs may harm throughput. Maximize throughput, + * power/energy consequences are not considered. */ if (sgs->sum_nr_running <= sgs->group_weight && group_smaller_cpu_capacity(sds->local, sg)) return false; } +asym_packing: /* This is the busiest node in its class. */ if (!(env->sd->flags & SD_ASYM_PACKING)) return true; @@ -8850,6 +9724,9 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq) } #endif /* CONFIG_NUMA_BALANCING */ +#define lb_sd_parent(sd) \ + (sd->parent && sd->parent->groups != sd->parent->groups->next) + /** * update_sd_lb_stats - Update sched_domain's statistics for load balancing. * @env: The load balancing environment. @@ -8935,7 +9812,7 @@ next_group: env->src_grp_nr_running = sds->busiest_stat.sum_nr_running; - if (!env->sd->parent) { + if (!lb_sd_parent(env->sd)) { /* update overload indicator if we are at root domain */ if (env->dst_rq->rd->overload != overload) env->dst_rq->rd->overload = overload; @@ -9228,8 +10105,11 @@ static struct sched_group *find_busiest_group(struct lb_env *env) if (busiest->group_type == group_imbalanced) goto force_balance; - /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ - if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) && + /* + * When dst_cpu is idle, prevent SMP nice and/or asymmetric group + * capacities from resulting in underutilization due to avg_load. + */ + if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) && busiest->group_no_capacity) goto force_balance; @@ -9461,6 +10341,7 @@ static int need_active_balance(struct lb_env *env) if (energy_aware() && (capacity_of(env->src_cpu) < capacity_of(env->dst_cpu)) && + ((capacity_orig_of(env->src_cpu) < capacity_orig_of(env->dst_cpu))) && env->src_rq->cfs.h_nr_running == 1 && cpu_overutilized(env->src_cpu) && !cpu_overutilized(env->dst_cpu)) { @@ -9524,7 +10405,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, int *continue_balancing) { int ld_moved = 0, cur_ld_moved, active_balance = 0; - struct sched_domain *sd_parent = sd->parent; + struct sched_domain *sd_parent = lb_sd_parent(sd) ? sd->parent : NULL; struct sched_group *group = NULL; struct rq *busiest = NULL; unsigned long flags; @@ -9597,6 +10478,7 @@ redo: more_balance: raw_spin_lock_irqsave(&busiest->lock, flags); + update_rq_clock(busiest); /* The world might have changed. Validate assumptions */ if (busiest->nr_running <= 1) { @@ -9610,11 +10492,6 @@ more_balance: * ld_moved - cumulative load moved across iterations */ cur_ld_moved = detach_tasks(&env); - /* - * We want to potentially lower env.src_cpu's OPP. - */ - if (cur_ld_moved) - update_capacity_of(env.src_cpu); /* * We've detached some tasks from busiest_rq. 
Every @@ -9864,7 +10741,6 @@ static int idle_balance(struct rq *this_rq) struct sched_domain *sd; int pulled_task = 0; u64 curr_cost = 0; - long removed_util=0; if (cpu_isolated(this_cpu)) return 0; @@ -9891,17 +10767,6 @@ static int idle_balance(struct rq *this_rq) raw_spin_unlock(&this_rq->lock); - /* - * If removed_util_avg is !0 we most probably migrated some task away - * from this_cpu. In this case we might be willing to trigger an OPP - * update, but we want to do so if we don't find anybody else to pull - * here (we will trigger an OPP update with the pulled task's enqueue - * anyway). - * - * Record removed_util before calling update_blocked_averages, and use - * it below (before returning) to see if an OPP update is required. - */ - removed_util = atomic_long_read(&(this_rq->cfs).removed_util_avg); update_blocked_averages(this_cpu); rcu_read_lock(); for_each_domain(this_cpu, sd) { @@ -9969,12 +10834,6 @@ out: if (pulled_task) { idle_exit_fair(this_rq); this_rq->idle_stamp = 0; - } else if (removed_util) { - /* - * No task pulled and someone has been migrated away. - * Good case to trigger an OPP update. - */ - update_capacity_of(this_cpu); } return pulled_task; @@ -9994,7 +10853,7 @@ static int active_load_balance_cpu_stop(void *data) struct rq *target_rq = cpu_rq(target_cpu); struct sched_domain *sd = NULL; struct task_struct *p = NULL; - struct task_struct *push_task; + struct task_struct *push_task = NULL; int push_task_detached = 0; struct lb_env env = { .sd = sd, @@ -10054,14 +10913,11 @@ static int active_load_balance_cpu_stop(void *data) if (likely(sd)) { env.sd = sd; schedstat_inc(sd, alb_count); + update_rq_clock(busiest_rq); p = detach_one_task(&env); if (p) { schedstat_inc(sd, alb_pushed); - /* - * We want to potentially lower env.src_cpu's OPP. - */ - update_capacity_of(env.src_cpu); moved = true; } else { schedstat_inc(sd, alb_failed); @@ -10114,11 +10970,6 @@ static inline int on_null_domain(struct rq *rq) * needed, they will kick the idle load balancer, which then does idle * load balancing for all the idle CPUs. */ -static struct { - cpumask_var_t idle_cpus_mask; - atomic_t nr_cpus; - unsigned long next_balance; /* in jiffy units */ -} nohz ____cacheline_aligned; #ifdef CONFIG_SCHED_HMP static inline int find_new_hmp_ilb(int type) @@ -10545,6 +11396,10 @@ static inline int _nohz_kick_needed(struct rq *rq, int cpu, int *type) (!energy_aware() || cpu_overutilized(cpu))) return true; + /* Do idle load balance if there have misfit task */ + if (energy_aware()) + return rq->misfit_task; + return (rq->nr_running >= 2); } @@ -10585,7 +11440,7 @@ static inline bool nohz_kick_needed(struct rq *rq, int *type) #ifndef CONFIG_SCHED_HMP rcu_read_lock(); sd = rcu_dereference(per_cpu(sd_busy, cpu)); - if (sd && !energy_aware()) { + if (sd) { sgc = sd->groups->sgc; nr_busy = atomic_read(&sgc->nr_busy_cpus); @@ -10718,31 +11573,17 @@ static void task_fork_fair(struct task_struct *p) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se, *curr; - int this_cpu = smp_processor_id(); struct rq *rq = this_rq(); - unsigned long flags; - - raw_spin_lock_irqsave(&rq->lock, flags); + raw_spin_lock(&rq->lock); update_rq_clock(rq); cfs_rq = task_cfs_rq(current); curr = cfs_rq->curr; - - /* - * Not only the cpu but also the task_group of the parent might have - * been changed after parent->se.parent,cfs_rq were copied to - * child->se.parent,cfs_rq. So call __set_task_cpu() to make those - * of child point to valid ones. 
- */ - rcu_read_lock(); - __set_task_cpu(p, this_cpu); - rcu_read_unlock(); - - update_curr(cfs_rq); - - if (curr) + if (curr) { + update_curr(cfs_rq); se->vruntime = curr->vruntime; + } place_entity(cfs_rq, se, 1); if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) { @@ -10755,8 +11596,7 @@ static void task_fork_fair(struct task_struct *p) } se->vruntime -= cfs_rq->min_vruntime; - - raw_spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_unlock(&rq->lock); } /* @@ -10808,6 +11648,61 @@ static inline bool vruntime_normalized(struct task_struct *p) return false; } +#ifdef CONFIG_FAIR_GROUP_SCHED +/* + * Propagate the changes of the sched_entity across the tg tree to make it + * visible to the root + */ +static void propagate_entity_cfs_rq(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq; + + /* Start to propagate at parent */ + se = se->parent; + + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); + + if (cfs_rq_throttled(cfs_rq)) + break; + + update_load_avg(se, UPDATE_TG); + } +} +#else +static void propagate_entity_cfs_rq(struct sched_entity *se) { } +#endif + +static void detach_entity_cfs_rq(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + /* Catch up with the cfs_rq and remove our load when we leave */ + update_load_avg(se, 0); + detach_entity_load_avg(cfs_rq, se); + update_tg_load_avg(cfs_rq, false); + propagate_entity_cfs_rq(se); +} + +static void attach_entity_cfs_rq(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = cfs_rq_of(se); + +#ifdef CONFIG_FAIR_GROUP_SCHED + /* + * Since the real-depth could have been changed (only FAIR + * class maintain depth value), reset depth properly. + */ + se->depth = se->parent ? se->parent->depth + 1 : 0; +#endif + + /* Synchronize entity with its cfs_rq */ + update_load_avg(se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD); + attach_entity_load_avg(cfs_rq, se); + update_tg_load_avg(cfs_rq, false); + propagate_entity_cfs_rq(se); +} + static void detach_task_cfs_rq(struct task_struct *p) { struct sched_entity *se = &p->se; @@ -10822,8 +11717,7 @@ static void detach_task_cfs_rq(struct task_struct *p) se->vruntime -= cfs_rq->min_vruntime; } - /* Catch up with the cfs_rq and remove our load when we leave */ - detach_entity_load_avg(cfs_rq, se); + detach_entity_cfs_rq(se); } static void attach_task_cfs_rq(struct task_struct *p) @@ -10831,16 +11725,7 @@ static void attach_task_cfs_rq(struct task_struct *p) struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); -#ifdef CONFIG_FAIR_GROUP_SCHED - /* - * Since the real-depth could have been changed (only FAIR - * class maintain depth value), reset depth properly. - */ - se->depth = se->parent ? se->parent->depth + 1 : 0; -#endif - - /* Synchronize task with its cfs_rq */ - attach_entity_load_avg(cfs_rq, se); + attach_entity_cfs_rq(se); if (!vruntime_normalized(p)) se->vruntime += cfs_rq->min_vruntime; @@ -10894,12 +11779,23 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; #endif #ifdef CONFIG_SMP +#ifdef CONFIG_FAIR_GROUP_SCHED + cfs_rq->propagate_avg = 0; +#endif atomic_long_set(&cfs_rq->removed_load_avg, 0); atomic_long_set(&cfs_rq->removed_util_avg, 0); #endif } #ifdef CONFIG_FAIR_GROUP_SCHED +static void task_set_group_fair(struct task_struct *p) +{ + struct sched_entity *se = &p->se; + + set_task_rq(p, task_cpu(p)); + se->depth = se->parent ? 
se->parent->depth + 1 : 0; +} + static void task_move_group_fair(struct task_struct *p) { detach_task_cfs_rq(p); @@ -10912,6 +11808,19 @@ static void task_move_group_fair(struct task_struct *p) attach_task_cfs_rq(p); } +static void task_change_group_fair(struct task_struct *p, int type) +{ + switch (type) { + case TASK_SET_GROUP: + task_set_group_fair(p); + break; + + case TASK_MOVE_GROUP: + task_move_group_fair(p); + break; + } +} + void free_fair_sched_group(struct task_group *tg) { int i; @@ -10931,8 +11840,9 @@ void free_fair_sched_group(struct task_group *tg) int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) { - struct cfs_rq *cfs_rq; struct sched_entity *se; + struct cfs_rq *cfs_rq; + struct rq *rq; int i; tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); @@ -10947,6 +11857,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) init_cfs_bandwidth(tg_cfs_bandwidth(tg)); for_each_possible_cpu(i) { + rq = cpu_rq(i); + cfs_rq = kzalloc_node(sizeof(struct cfs_rq), GFP_KERNEL, cpu_to_node(i)); if (!cfs_rq) @@ -10960,6 +11872,10 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) init_cfs_rq(cfs_rq); init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); init_entity_runnable_average(se); + + raw_spin_lock_irq(&rq->lock); + post_init_entity_util_avg(se); + raw_spin_unlock_irq(&rq->lock); } return 1; @@ -11056,8 +11972,10 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) /* Possible calls to update_curr() need rq clock */ update_rq_clock(rq); - for_each_sched_entity(se) - update_cfs_shares(group_cfs_rq(se)); + for_each_sched_entity(se) { + update_load_avg(se, UPDATE_TG); + update_cfs_shares(se); + } raw_spin_unlock_irqrestore(&rq->lock, flags); } @@ -11134,7 +12052,7 @@ const struct sched_class fair_sched_class = { .update_curr = update_curr_fair, #ifdef CONFIG_FAIR_GROUP_SCHED - .task_move_group = task_move_group_fair, + .task_change_group = task_change_group_fair, #endif #ifdef CONFIG_SCHED_HMP .inc_hmp_sched_stats = inc_hmp_sched_stats_fair, diff --git a/kernel/sched/hmp.c b/kernel/sched/hmp.c index ae6876e62c0f..ea066ab8376b 100644 --- a/kernel/sched/hmp.c +++ b/kernel/sched/hmp.c @@ -1526,6 +1526,10 @@ unsigned int cpu_temp(int cpu) return 0; } +/* + * kfree() may wakeup kswapd. So this function should NOT be called + * with any CPU's rq->lock acquired. + */ void free_task_load_ptrs(struct task_struct *p) { kfree(p->ravg.curr_window_cpu); @@ -2608,7 +2612,8 @@ update_task_rq_cpu_cycles(struct task_struct *p, struct rq *rq, int event, p->cpu_cycles = cur_cycles; - trace_sched_get_task_cpu_cycles(cpu, event, rq->cc.cycles, rq->cc.time); + trace_sched_get_task_cpu_cycles(cpu, event, rq->cc.cycles, + rq->cc.time, p); } static int diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index 36c6634236fb..d562efb04775 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -9,7 +9,8 @@ #ifdef CONFIG_SMP static int -select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags) +select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags, + int sibling_count_hint) { return task_cpu(p); /* IDLE tasks as never migrated */ } diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index b0b93fd33af9..f8e8d68ed3fd 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -201,8 +201,9 @@ void calc_load_exit_idle(void) struct rq *this_rq = this_rq(); /* - * If we're still before the sample window, we're done. 
+ * If we're still before the pending sample window, we're done. */ + this_rq->calc_load_update = calc_load_update; if (time_before(jiffies, this_rq->calc_load_update)) return; @@ -211,7 +212,6 @@ void calc_load_exit_idle(void) * accounted through the nohz accounting, so skip the entire deal and * sync up for the next window. */ - this_rq->calc_load_update = calc_load_update; if (time_before(jiffies, this_rq->calc_load_update + 10)) this_rq->calc_load_update += LOAD_FREQ; } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 29345ed74069..05d635c2beec 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -5,9 +5,13 @@ #include "sched.h" +#include <linux/interrupt.h> #include <linux/slab.h> #include <linux/irq_work.h> #include <trace/events/sched.h> +#include <linux/hrtimer.h> + +#include "tune.h" int sched_rr_timeslice = RR_TIMESLICE; @@ -65,10 +69,6 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) raw_spin_unlock(&rt_b->rt_runtime_lock); } -#if defined(CONFIG_SMP) && defined(HAVE_RT_PUSH_IPI) -static void push_irq_work_func(struct irq_work *work); -#endif - void init_rt_rq(struct rt_rq *rt_rq) { struct rt_prio_array *array; @@ -88,13 +88,6 @@ void init_rt_rq(struct rt_rq *rt_rq) rt_rq->rt_nr_migratory = 0; rt_rq->overloaded = 0; plist_head_init(&rt_rq->pushable_tasks); - -#ifdef HAVE_RT_PUSH_IPI - rt_rq->push_flags = 0; - rt_rq->push_cpu = nr_cpu_ids; - raw_spin_lock_init(&rt_rq->push_lock); - init_irq_work(&rt_rq->push_work, push_irq_work_func); -#endif #endif /* CONFIG_SMP */ /* We start is dequeued state, because no RT tasks are queued */ rt_rq->rt_queued = 0; @@ -988,6 +981,70 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) return 0; } +#define RT_SCHEDTUNE_INTERVAL 50000000ULL + +static enum hrtimer_restart rt_schedtune_timer(struct hrtimer *timer) +{ + struct sched_rt_entity *rt_se = container_of(timer, + struct sched_rt_entity, + schedtune_timer); + struct task_struct *p = rt_task_of(rt_se); + struct rq *rq = task_rq(p); + + raw_spin_lock(&rq->lock); + + /* + * Nothing to do if: + * - task has switched runqueues + * - task isn't RT anymore + */ + if (rq != task_rq(p) || (p->sched_class != &rt_sched_class)) + goto out; + + /* + * If task got enqueued back during callback time, it means we raced + * with the enqueue on another cpu, that's Ok, just do nothing as + * enqueue path would have tried to cancel us and we shouldn't run + * Also check the schedtune_enqueued flag as class-switch on a + * sleeping task may have already canceled the timer and done dq + */ + if (p->on_rq || !rt_se->schedtune_enqueued) + goto out; + + /* + * RT task is no longer active, cancel boost + */ + rt_se->schedtune_enqueued = false; + schedtune_dequeue_task(p, cpu_of(rq)); + cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_RT); +out: + raw_spin_unlock(&rq->lock); + + /* + * This can free the task_struct if no more references. + */ + put_task_struct(p); + + return HRTIMER_NORESTART; +} + +void init_rt_schedtune_timer(struct sched_rt_entity *rt_se) +{ + struct hrtimer *timer = &rt_se->schedtune_timer; + + hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + timer->function = rt_schedtune_timer; + rt_se->schedtune_enqueued = false; +} + +static void start_schedtune_timer(struct sched_rt_entity *rt_se) +{ + struct hrtimer *timer = &rt_se->schedtune_timer; + + hrtimer_start(timer, ns_to_ktime(RT_SCHEDTUNE_INTERVAL), + HRTIMER_MODE_REL_PINNED); +} + /* * Update the current task's runtime statistics. Skip current tasks that * are not in our scheduling class. 
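The schedtune timer added above keeps an RT task's boost alive across short sleeps: RT_SCHEDTUNE_INTERVAL is 50000000 ns, i.e. 50 ms, after which the pinned hrtimer drops the boost if the task has not been enqueued again. The sketch below is a stand-alone, user-space model of that flag/reference discipline, not part of the patch; the struct fields and helpers are invented for illustration and the hrtimer is replaced by explicit simulated events.

/*
 * Illustrative model only. "refs" stands in for the task_struct reference
 * that the real code takes with get_task_struct() before arming the timer
 * and drops with put_task_struct() when the timer is cancelled or fires.
 */
#include <stdbool.h>
#include <stdio.h>

struct task {
        int refs;          /* task_struct reference count (simplified)   */
        bool on_rq;        /* task currently enqueued                    */
        bool boosted;      /* mirrors rt_se->schedtune_enqueued          */
        bool timer_armed;  /* mirrors the schedtune hrtimer being queued */
};

static void enqueue(struct task *t)
{
        t->on_rq = true;
        /* Enqueue cancels a pending deboost and drops its reference. */
        if (t->timer_armed) {
                t->timer_armed = false;
                t->refs--;
        }
        if (!t->boosted) {
                t->boosted = true;
                printf("boost applied\n");
        }
}

static void dequeue_sleep(struct task *t)
{
        t->on_rq = false;
        if (t->boosted && !t->timer_armed) {
                /* Hold the boost for up to 50 ms; pin a task reference. */
                t->refs++;
                t->timer_armed = true;
        }
}

static void timer_fires(struct task *t)
{
        t->timer_armed = false;
        /* Deboost only if the task is still asleep and still boosted. */
        if (!t->on_rq && t->boosted) {
                t->boosted = false;
                printf("boost dropped after 50 ms of sleep\n");
        }
        t->refs--;         /* reference taken when the timer was armed */
}

int main(void)
{
        struct task t = { .refs = 1 };

        enqueue(&t);       /* boost applied                                  */
        dequeue_sleep(&t); /* timer armed, boost kept                        */
        enqueue(&t);       /* woke within 50 ms: timer cancelled, boost kept */
        dequeue_sleep(&t);
        timer_fires(&t);   /* slept past 50 ms: deboost                      */
        printf("refs back to %d\n", t.refs);
        return 0;
}

As in the patch, the boost is only dropped from the timer callback when the task is still dequeued, so a task that wakes again within the interval never sees a deboost/boost cycle (and the needless OPP drop it would cause).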
@@ -1005,6 +1062,9 @@ static void update_curr_rt(struct rq *rq) if (unlikely((s64)delta_exec <= 0)) return; + /* Kick cpufreq (see the comment in kernel/sched/sched.h). */ + cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_RT); + schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); @@ -1390,6 +1450,33 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) if (!task_current(rq, p) && p->nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); + + if (!schedtune_task_boost(p)) + return; + + /* + * If schedtune timer is active, that means a boost was already + * done, just cancel the timer so that deboost doesn't happen. + * Otherwise, increase the boost. If an enqueued timer was + * cancelled, put the task reference. + */ + if (hrtimer_try_to_cancel(&rt_se->schedtune_timer) == 1) + put_task_struct(p); + + /* + * schedtune_enqueued can be true in the following situation: + * enqueue_task_rt grabs rq lock before timer fires + * or before its callback acquires rq lock + * schedtune_enqueued can be false if timer callback is running + * and timer just released rq lock, or if the timer finished + * running and canceling the boost + */ + if (rt_se->schedtune_enqueued) + return; + + rt_se->schedtune_enqueued = true; + schedtune_enqueue_task(p, cpu_of(rq)); + cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_RT); } static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) @@ -1401,6 +1488,19 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) dec_hmp_sched_stats_rt(rq, p); dequeue_pushable_task(rq, p); + + if (!rt_se->schedtune_enqueued) + return; + + if (flags == DEQUEUE_SLEEP) { + get_task_struct(p); + start_schedtune_timer(rt_se); + return; + } + + rt_se->schedtune_enqueued = false; + schedtune_dequeue_task(p, cpu_of(rq)); + cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_RT); } /* @@ -1456,11 +1556,57 @@ select_task_rq_rt_hmp(struct task_struct *p, int cpu, int sd_flag, int flags) } #endif +/* + * Return whether the task on the given cpu is currently non-preemptible + * while handling a potentially long softint, or if the task is likely + * to block preemptions soon because it is a ksoftirq thread that is + * handling slow softints. + */ +bool +task_may_not_preempt(struct task_struct *task, int cpu) +{ + __u32 softirqs = per_cpu(active_softirqs, cpu) | + __IRQ_STAT(cpu, __softirq_pending); + struct task_struct *cpu_ksoftirqd = per_cpu(ksoftirqd, cpu); + + return ((softirqs & LONG_SOFTIRQ_MASK) && + (task == cpu_ksoftirqd || + task_thread_info(task)->preempt_count & SOFTIRQ_MASK)); +} + +/* + * Perform a schedtune dequeue and cancelation of boost timers if needed. + * Should be called only with the rq->lock held. + */ +static void schedtune_dequeue_rt(struct rq *rq, struct task_struct *p) +{ + struct sched_rt_entity *rt_se = &p->rt; + + BUG_ON(!raw_spin_is_locked(&rq->lock)); + + if (!rt_se->schedtune_enqueued) + return; + + /* + * Incase of class change cancel any active timers. If an enqueued + * timer was cancelled, put the task ref. 
+ */ + if (hrtimer_try_to_cancel(&rt_se->schedtune_timer) == 1) + put_task_struct(p); + + /* schedtune_enqueued is true, deboost it */ + rt_se->schedtune_enqueued = false; + schedtune_dequeue_task(p, task_cpu(p)); + cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_RT); +} + static int -select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) +select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags, + int sibling_count_hint) { struct task_struct *curr; struct rq *rq; + bool may_not_preempt; #ifdef CONFIG_SCHED_HMP return select_task_rq_rt_hmp(p, cpu, sd_flag, flags); @@ -1476,7 +1622,17 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) curr = READ_ONCE(rq->curr); /* unlocked access */ /* - * If the current task on @p's runqueue is an RT task, then + * If the current task on @p's runqueue is a softirq task, + * it may run without preemption for a time that is + * ill-suited for a waiting RT task. Therefore, try to + * wake this RT task on another runqueue. + * + * Also, if the current task on @p's runqueue is an RT task, then + * it may run without preemption for a time that is + * ill-suited for a waiting RT task. Therefore, try to + * wake this RT task on another runqueue. + * + * Also, if the current task on @p's runqueue is an RT task, then * try to see if we can wake this RT task up on another * runqueue. Otherwise simply start this RT task * on its current runqueue. @@ -1497,22 +1653,40 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) * This test is optimistic, if we get it wrong the load-balancer * will have to sort it out. */ - if (curr && unlikely(rt_task(curr)) && + may_not_preempt = task_may_not_preempt(curr, cpu); + if (may_not_preempt || + (unlikely(rt_task(curr)) && (curr->nr_cpus_allowed < 2 || - curr->prio <= p->prio)) { + curr->prio <= p->prio))) { int target = find_lowest_rq(p); /* - * Don't bother moving it if the destination CPU is - * not running a lower priority task. + * If cpu is non-preemptible, prefer remote cpu + * even if it's running a higher-prio task. + * Otherwise: Don't bother moving it if the + * destination CPU is not running a lower priority task. */ if (target != -1 && - p->prio < cpu_rq(target)->rt.highest_prio.curr) + (may_not_preempt || + p->prio < cpu_rq(target)->rt.highest_prio.curr)) cpu = target; } rcu_read_unlock(); out: + /* + * If previous CPU was different, make sure to cancel any active + * schedtune timers and deboost. + */ + if (task_cpu(p) != cpu) { + unsigned long fl; + struct rq *prq = task_rq(p); + + raw_spin_lock_irqsave(&prq->lock, fl); + schedtune_dequeue_rt(prq, p); + raw_spin_unlock_irqrestore(&prq->lock, fl); + } + return cpu; } @@ -1573,41 +1747,6 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flag #endif } -#if defined(CONFIG_SMP) && defined(CONFIG_CPU_FREQ_GOV_SCHED) -static void sched_rt_update_capacity_req(struct rq *rq) -{ - u64 total, used, age_stamp, avg; - s64 delta; - - if (!sched_freq()) - return; - - sched_avg_update(rq); - /* - * Since we're reading these variables without serialization make sure - * we read them once before doing sanity checks on them. 
- */ - age_stamp = READ_ONCE(rq->age_stamp); - avg = READ_ONCE(rq->rt_avg); - delta = rq_clock(rq) - age_stamp; - - if (unlikely(delta < 0)) - delta = 0; - - total = sched_avg_period() + delta; - - used = div_u64(avg, total); - if (unlikely(used > SCHED_CAPACITY_SCALE)) - used = SCHED_CAPACITY_SCALE; - - set_rt_cpu_capacity(rq->cpu, 1, (unsigned long)(used)); -} -#else -static inline void sched_rt_update_capacity_req(struct rq *rq) -{ } - -#endif - static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, struct rt_rq *rt_rq) { @@ -1676,17 +1815,8 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev) if (prev->sched_class == &rt_sched_class) update_curr_rt(rq); - if (!rt_rq->rt_queued) { - /* - * The next task to be picked on this rq will have a lower - * priority than rt tasks so we can spend some time to update - * the capacity used by rt tasks based on the last activity. - * This value will be the used as an estimation of the next - * activity. - */ - sched_rt_update_capacity_req(rq); + if (!rt_rq->rt_queued) return NULL; - } put_prev_task(rq, prev); @@ -1785,6 +1915,7 @@ static int find_lowest_rq_hmp(struct task_struct *task) * the best one based on our affinity and topology. */ +retry: for_each_sched_cluster(cluster) { if (boost_on_big && cluster->capacity != max_possible_capacity) continue; @@ -1792,6 +1923,15 @@ static int find_lowest_rq_hmp(struct task_struct *task) cpumask_and(&candidate_mask, &cluster->cpus, lowest_mask); cpumask_andnot(&candidate_mask, &candidate_mask, cpu_isolated_mask); + /* + * When placement boost is active, if there is no eligible CPU + * in the highest capacity cluster, we fallback to the other + * clusters. So clear the CPUs of the traversed cluster from + * the lowest_mask. + */ + if (unlikely(boost_on_big)) + cpumask_andnot(lowest_mask, lowest_mask, + &cluster->cpus); if (cpumask_empty(&candidate_mask)) continue; @@ -1831,6 +1971,11 @@ static int find_lowest_rq_hmp(struct task_struct *task) break; } + if (unlikely(boost_on_big && best_cpu == -1)) { + boost_on_big = 0; + goto retry; + } + return best_cpu; } #endif /* CONFIG_SCHED_HMP */ @@ -2064,7 +2209,9 @@ retry: next_task->on_rq = TASK_ON_RQ_MIGRATING; deactivate_task(rq, next_task, 0); + next_task->on_rq = TASK_ON_RQ_MIGRATING; set_task_cpu(next_task, lowest_rq->cpu); + next_task->on_rq = TASK_ON_RQ_QUEUED; activate_task(lowest_rq, next_task, 0); next_task->on_rq = TASK_ON_RQ_QUEUED; ret = 1; @@ -2087,160 +2234,172 @@ static void push_rt_tasks(struct rq *rq) } #ifdef HAVE_RT_PUSH_IPI + /* - * The search for the next cpu always starts at rq->cpu and ends - * when we reach rq->cpu again. It will never return rq->cpu. - * This returns the next cpu to check, or nr_cpu_ids if the loop - * is complete. + * When a high priority task schedules out from a CPU and a lower priority + * task is scheduled in, a check is made to see if there's any RT tasks + * on other CPUs that are waiting to run because a higher priority RT task + * is currently running on its CPU. In this case, the CPU with multiple RT + * tasks queued on it (overloaded) needs to be notified that a CPU has opened + * up that may be able to run one of its non-running queued RT tasks. + * + * All CPUs with overloaded RT tasks need to be notified as there is currently + * no way to know which of these CPUs have the highest priority task waiting + * to run. 
Instead of trying to take a spinlock on each of these CPUs, + * which has shown to cause large latency when done on machines with many + * CPUs, an IPI is sent to the CPUs to have them push off the overloaded + * RT tasks waiting to run. + * + * Just sending an IPI to each of the CPUs is also an issue, as on large + * CPU count machines, this can cause an IPI storm on a CPU, especially + * if it's the only CPU with multiple RT tasks queued, and a large number + * of CPUs scheduling a lower priority task at the same time. + * + * Each root domain has its own irq work function that can iterate over + * all CPUs with RT overloaded tasks. Since all CPUs with overloaded RT + * tasks must be checked if there's one or many CPUs that are lowering + * their priority, there's a single irq work iterator that will try to + * push off RT tasks that are waiting to run. + * + * When a CPU schedules a lower priority task, it will kick off the + * irq work iterator that will jump to each CPU with overloaded RT tasks. + * As it only takes the first CPU that schedules a lower priority task + * to start the process, the rto_start variable is incremented and if + * the atomic result is one, then that CPU will try to take the rto_lock. + * This prevents high contention on the lock as the process handles all + * CPUs scheduling lower priority tasks. + * + * All CPUs that are scheduling a lower priority task will increment the + * rt_loop_next variable. This will make sure that the irq work iterator + * checks all RT overloaded CPUs whenever a CPU schedules a new lower + * priority task, even if the iterator is in the middle of a scan. Incrementing + * the rt_loop_next will cause the iterator to perform another scan. * - * rq->rt.push_cpu holds the last cpu returned by this function, - * or if this is the first instance, it must hold rq->cpu. */ -static int rto_next_cpu(struct rq *rq) +static int rto_next_cpu(struct root_domain *rd) { - int prev_cpu = rq->rt.push_cpu; + int next; int cpu; - cpu = cpumask_next(prev_cpu, rq->rd->rto_mask); - /* - * If the previous cpu is less than the rq's CPU, then it already - * passed the end of the mask, and has started from the beginning. - * We end if the next CPU is greater or equal to rq's CPU. + * When starting the IPI RT pushing, the rto_cpu is set to -1, + * rto_next_cpu() will simply return the first CPU found in + * the rto_mask. + * + * If rto_next_cpu() is called with rto_cpu set to a valid cpu, it + * will return the next CPU found in the rto_mask. + * + * If there are no more CPUs left in the rto_mask, then a check is made + * against rto_loop and rto_loop_next. rto_loop is only updated with + * the rto_lock held, but any CPU may increment the rto_loop_next + * without any locking. */ - if (prev_cpu < rq->cpu) { - if (cpu >= rq->cpu) - return nr_cpu_ids; + for (;;) { - } else if (cpu >= nr_cpu_ids) { - /* - * We passed the end of the mask, start at the beginning. - * If the result is greater or equal to the rq's CPU, then - * the loop is finished.
- */ - cpu = cpumask_first(rq->rd->rto_mask); - if (cpu >= rq->cpu) - return nr_cpu_ids; - } - rq->rt.push_cpu = cpu; + /* When rto_cpu is -1 this acts like cpumask_first() */ + cpu = cpumask_next(rd->rto_cpu, rd->rto_mask); - /* Return cpu to let the caller know if the loop is finished or not */ - return cpu; -} + rd->rto_cpu = cpu; -static int find_next_push_cpu(struct rq *rq) -{ - struct rq *next_rq; - int cpu; + if (cpu < nr_cpu_ids) + return cpu; - while (1) { - cpu = rto_next_cpu(rq); - if (cpu >= nr_cpu_ids) - break; - next_rq = cpu_rq(cpu); + rd->rto_cpu = -1; - /* Make sure the next rq can push to this rq */ - if (next_rq->rt.highest_prio.next < rq->rt.highest_prio.curr) + /* + * ACQUIRE ensures we see the @rto_mask changes + * made prior to the @next value observed. + * + * Matches WMB in rt_set_overload(). + */ + next = atomic_read_acquire(&rd->rto_loop_next); + + if (rd->rto_loop == next) break; + + rd->rto_loop = next; } - return cpu; + return -1; +} + +static inline bool rto_start_trylock(atomic_t *v) +{ + return !atomic_cmpxchg_acquire(v, 0, 1); } -#define RT_PUSH_IPI_EXECUTING 1 -#define RT_PUSH_IPI_RESTART 2 +static inline void rto_start_unlock(atomic_t *v) +{ + atomic_set_release(v, 0); +} static void tell_cpu_to_push(struct rq *rq) { - int cpu; - - if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) { - raw_spin_lock(&rq->rt.push_lock); - /* Make sure it's still executing */ - if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) { - /* - * Tell the IPI to restart the loop as things have - * changed since it started. - */ - rq->rt.push_flags |= RT_PUSH_IPI_RESTART; - raw_spin_unlock(&rq->rt.push_lock); - return; - } - raw_spin_unlock(&rq->rt.push_lock); - } + int cpu = -1; - /* When here, there's no IPI going around */ + /* Keep the loop going if the IPI is currently active */ + atomic_inc(&rq->rd->rto_loop_next); - rq->rt.push_cpu = rq->cpu; - cpu = find_next_push_cpu(rq); - if (cpu >= nr_cpu_ids) + /* Only one CPU can initiate a loop at a time */ + if (!rto_start_trylock(&rq->rd->rto_loop_start)) return; - rq->rt.push_flags = RT_PUSH_IPI_EXECUTING; + raw_spin_lock(&rq->rd->rto_lock); - irq_work_queue_on(&rq->rt.push_work, cpu); + /* + * The rto_cpu is updated under the lock, if it has a valid cpu + * then the IPI is still running and will continue due to the + * update to loop_next, and nothing needs to be done here. + * Otherwise it is finishing up and an ipi needs to be sent. + */ + if (rq->rd->rto_cpu < 0) + cpu = rto_next_cpu(rq->rd); + + raw_spin_unlock(&rq->rd->rto_lock); + + rto_start_unlock(&rq->rd->rto_loop_start); + + if (cpu >= 0) { + /* Make sure the rd does not get freed while pushing */ + sched_get_rd(rq->rd); + irq_work_queue_on(&rq->rd->rto_push_work, cpu); + } } /* Called from hardirq context */ -static void try_to_push_tasks(void *arg) +void rto_push_irq_work_func(struct irq_work *work) { - struct rt_rq *rt_rq = arg; - struct rq *rq, *src_rq; - int this_cpu; + struct root_domain *rd = + container_of(work, struct root_domain, rto_push_work); + struct rq *rq; int cpu; - this_cpu = rt_rq->push_cpu; - - /* Paranoid check */ - BUG_ON(this_cpu != smp_processor_id()); - - rq = cpu_rq(this_cpu); - src_rq = rq_of_rt_rq(rt_rq); + rq = this_rq(); -again: + /* + * We do not need to grab the lock to check for has_pushable_tasks. + * When it gets updated, a check is made if a push is possible. 
+ */ if (has_pushable_tasks(rq)) { raw_spin_lock(&rq->lock); - push_rt_task(rq); + push_rt_tasks(rq); raw_spin_unlock(&rq->lock); } - /* Pass the IPI to the next rt overloaded queue */ - raw_spin_lock(&rt_rq->push_lock); - /* - * If the source queue changed since the IPI went out, - * we need to restart the search from that CPU again. - */ - if (rt_rq->push_flags & RT_PUSH_IPI_RESTART) { - rt_rq->push_flags &= ~RT_PUSH_IPI_RESTART; - rt_rq->push_cpu = src_rq->cpu; - } + raw_spin_lock(&rd->rto_lock); - cpu = find_next_push_cpu(src_rq); + /* Pass the IPI to the next rt overloaded queue */ + cpu = rto_next_cpu(rd); - if (cpu >= nr_cpu_ids) - rt_rq->push_flags &= ~RT_PUSH_IPI_EXECUTING; - raw_spin_unlock(&rt_rq->push_lock); + raw_spin_unlock(&rd->rto_lock); - if (cpu >= nr_cpu_ids) + if (cpu < 0) { + sched_put_rd(rd); return; - - /* - * It is possible that a restart caused this CPU to be - * chosen again. Don't bother with an IPI, just see if we - * have more to push. - */ - if (unlikely(cpu == rq->cpu)) - goto again; + } /* Try the next RT overloaded CPU */ - irq_work_queue_on(&rt_rq->push_work, cpu); -} - -static void push_irq_work_func(struct irq_work *work) -{ - struct rt_rq *rt_rq = container_of(work, struct rt_rq, push_work); - - try_to_push_tasks(rt_rq); + irq_work_queue_on(&rd->rto_push_work, cpu); } #endif /* HAVE_RT_PUSH_IPI */ @@ -2250,8 +2409,9 @@ static void pull_rt_task(struct rq *this_rq) bool resched = false; struct task_struct *p; struct rq *src_rq; + int rt_overload_count = rt_overloaded(this_rq); - if (likely(!rt_overloaded(this_rq))) + if (likely(!rt_overload_count)) return; /* @@ -2260,6 +2420,11 @@ static void pull_rt_task(struct rq *this_rq) */ smp_rmb(); + /* If we are the only overloaded CPU do nothing */ + if (rt_overload_count == 1 && + cpumask_test_cpu(this_rq->cpu, this_rq->rd->rto_mask)) + return; + #ifdef HAVE_RT_PUSH_IPI if (sched_feat(RT_PUSH_IPI)) { tell_cpu_to_push(this_rq); @@ -2320,7 +2485,9 @@ static void pull_rt_task(struct rq *this_rq) p->on_rq = TASK_ON_RQ_MIGRATING; deactivate_task(src_rq, p, 0); + p->on_rq = TASK_ON_RQ_MIGRATING; set_task_cpu(p, this_cpu); + p->on_rq = TASK_ON_RQ_QUEUED; activate_task(this_rq, p, 0); p->on_rq = TASK_ON_RQ_QUEUED; /* @@ -2382,6 +2549,13 @@ static void rq_offline_rt(struct rq *rq) static void switched_from_rt(struct rq *rq, struct task_struct *p) { /* + * On class switch from rt, always cancel active schedtune timers, + * this handles the cases where we switch class for a task that is + * already rt-dequeued but has a running timer. + */ + schedtune_dequeue_rt(rq, p); + + /* * If there are other RT tasks then we will reschedule * and the scheduling of the other RT tasks will handle * the balancing. 
But if we are the last RT task @@ -2500,9 +2674,6 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) update_curr_rt(rq); - if (rq->rt.rt_nr_running) - sched_rt_update_capacity_req(rq); - watchdog(rq, p); /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b88f647ea935..b6cd12998f16 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -340,7 +340,15 @@ extern void sched_move_task(struct task_struct *tsk); #ifdef CONFIG_FAIR_GROUP_SCHED extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); -#endif + +#ifdef CONFIG_SMP +extern void set_task_rq_fair(struct sched_entity *se, + struct cfs_rq *prev, struct cfs_rq *next); +#else /* !CONFIG_SMP */ +static inline void set_task_rq_fair(struct sched_entity *se, + struct cfs_rq *prev, struct cfs_rq *next) { } +#endif /* CONFIG_SMP */ +#endif /* CONFIG_FAIR_GROUP_SCHED */ extern struct task_group *css_tg(struct cgroup_subsys_state *css); #else /* CONFIG_CGROUP_SCHED */ @@ -465,6 +473,7 @@ struct cfs_rq { unsigned long runnable_load_avg; #ifdef CONFIG_FAIR_GROUP_SCHED unsigned long tg_load_avg_contrib; + unsigned long propagate_avg; #endif atomic_long_t removed_load_avg, removed_util_avg; #ifndef CONFIG_64BIT @@ -523,7 +532,7 @@ static inline int rt_bandwidth_enabled(void) } /* RT IPI pull logic requires IRQ_WORK */ -#ifdef CONFIG_IRQ_WORK +#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_SMP) # define HAVE_RT_PUSH_IPI #endif @@ -544,12 +553,6 @@ struct rt_rq { unsigned long rt_nr_total; int overloaded; struct plist_head pushable_tasks; -#ifdef HAVE_RT_PUSH_IPI - int push_flags; - int push_cpu; - struct irq_work push_work; - raw_spinlock_t push_lock; -#endif #endif /* CONFIG_SMP */ int rt_queued; @@ -642,6 +645,19 @@ struct root_domain { struct dl_bw dl_bw; struct cpudl cpudl; +#ifdef HAVE_RT_PUSH_IPI + /* + * For IPI pull requests, loop across the rto_mask. + */ + struct irq_work rto_push_work; + raw_spinlock_t rto_lock; + /* These are only updated and read within rto_lock */ + int rto_loop; + int rto_cpu; + /* These atomics are updated outside of a lock */ + atomic_t rto_loop_next; + atomic_t rto_loop_start; +#endif /* * The "RT overload" flag: it gets set if a CPU has more than * one runnable RT task. @@ -651,10 +667,18 @@ struct root_domain { /* Maximum cpu capacity in the system. 
*/ struct max_cpu_capacity max_cpu_capacity; + + /* First cpu with maximum and minimum original capacity */ + int max_cap_orig_cpu, min_cap_orig_cpu; }; extern struct root_domain def_root_domain; +extern void sched_get_rd(struct root_domain *rd); +extern void sched_put_rd(struct root_domain *rd); +#ifdef HAVE_RT_PUSH_IPI +extern void rto_push_irq_work_func(struct irq_work *work); +#endif #endif /* CONFIG_SMP */ /* @@ -708,6 +732,7 @@ struct rq { #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this cpu: */ struct list_head leaf_cfs_rq_list; + struct list_head *tmp_alone_branch; #endif /* CONFIG_FAIR_GROUP_SCHED */ /* @@ -789,6 +814,19 @@ struct rq { int curr_top; #endif +#ifdef CONFIG_SCHED_WALT + u64 cumulative_runnable_avg; + u64 window_start; + u64 curr_runnable_sum; + u64 prev_runnable_sum; + u64 nt_curr_runnable_sum; + u64 nt_prev_runnable_sum; + u64 cur_irqload; + u64 avg_irqload; + u64 irqload_ts; + u64 cum_window_demand; +#endif /* CONFIG_SCHED_WALT */ + #ifdef CONFIG_IRQ_TIME_ACCOUNTING u64 prev_irq_time; #endif @@ -827,6 +865,9 @@ struct rq { /* try_to_wake_up() stats */ unsigned int ttwu_count; unsigned int ttwu_local; +#ifdef CONFIG_SMP + struct eas_stats eas_stats; +#endif #endif #ifdef CONFIG_SMP @@ -997,6 +1038,7 @@ struct sched_group_capacity { */ unsigned long capacity; unsigned long max_capacity; /* Max per-cpu capacity in group */ + unsigned long min_capacity; /* Min per-CPU capacity in group */ unsigned long next_update; int imbalance; /* XXX unrelated to capacity but shared group state */ /* @@ -1217,7 +1259,7 @@ static inline int cpu_min_power_cost(int cpu) return cpu_rq(cpu)->cluster->min_power_cost; } -static inline u32 cpu_cycles_to_freq(u64 cycles, u32 period) +static inline u32 cpu_cycles_to_freq(u64 cycles, u64 period) { return div64_u64(cycles, period); } @@ -1232,6 +1274,11 @@ static inline bool is_max_capacity_cpu(int cpu) return cpu_max_possible_capacity(cpu) == max_possible_capacity; } +static inline bool is_min_capacity_cpu(int cpu) +{ + return cpu_max_possible_capacity(cpu) == min_max_possible_capacity; +} + /* * 'load' is in reference to "best cpu" at its best frequency. * Scale that in reference to a given cpu, accounting for how bad it is @@ -1729,6 +1776,7 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) #endif #ifdef CONFIG_FAIR_GROUP_SCHED + set_task_rq_fair(&p->se, p->se.cfs_rq, tg->cfs_rq[cpu]); p->se.cfs_rq = tg->cfs_rq[cpu]; p->se.parent = tg->se[cpu]; #endif @@ -1758,7 +1806,11 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) * per-task data have been completed by this moment. 
*/ smp_wmb(); +#ifdef CONFIG_THREAD_INFO_IN_TASK + p->cpu = cpu; +#else task_thread_info(p)->cpu = cpu; +#endif p->wake_cpu = cpu; #endif } @@ -2011,7 +2063,8 @@ struct sched_class { void (*put_prev_task) (struct rq *rq, struct task_struct *p); #ifdef CONFIG_SMP - int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); + int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags, + int subling_count_hint); void (*migrate_task_rq)(struct task_struct *p); void (*task_waking) (struct task_struct *task); @@ -2044,8 +2097,11 @@ struct sched_class { void (*update_curr) (struct rq *rq); +#define TASK_SET_GROUP 0 +#define TASK_MOVE_GROUP 1 + #ifdef CONFIG_FAIR_GROUP_SCHED - void (*task_move_group) (struct task_struct *p); + void (*task_change_group)(struct task_struct *p, int type); #endif #ifdef CONFIG_SCHED_HMP void (*inc_hmp_sched_stats)(struct rq *rq, struct task_struct *p); @@ -2150,6 +2206,7 @@ extern void resched_cpu(int cpu); extern struct rt_bandwidth def_rt_bandwidth; extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); +extern void init_rt_schedtune_timer(struct sched_rt_entity *rt_se); extern struct dl_bandwidth def_dl_bandwidth; extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime); @@ -2158,6 +2215,7 @@ extern void init_dl_task_timer(struct sched_dl_entity *dl_se); unsigned long to_ratio(u64 period, u64 runtime); extern void init_entity_runnable_average(struct sched_entity *se); +extern void post_init_entity_util_avg(struct sched_entity *se); static inline void __add_nr_running(struct rq *rq, unsigned count) { @@ -2316,7 +2374,7 @@ static inline unsigned long capacity_orig_of(int cpu) extern unsigned int sysctl_sched_use_walt_cpu_util; extern unsigned int walt_ravg_window; -extern unsigned int walt_disabled; +extern bool walt_disabled; /* * cpu_util returns the amount of capacity of a CPU that is used by CFS @@ -2349,6 +2407,12 @@ static inline unsigned long __cpu_util(int cpu, int delta) unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg; unsigned long capacity = capacity_orig_of(cpu); +#ifdef CONFIG_SCHED_WALT + if (!walt_disabled && sysctl_sched_use_walt_cpu_util) + util = div64_u64(cpu_rq(cpu)->cumulative_runnable_avg, + walt_ravg_window >> SCHED_LOAD_SHIFT); +#endif + delta += util; if (delta < 0) return 0; @@ -2361,60 +2425,19 @@ static inline unsigned long cpu_util(int cpu) return __cpu_util(cpu, 0); } -#endif - -#ifdef CONFIG_CPU_FREQ_GOV_SCHED -#define capacity_max SCHED_CAPACITY_SCALE -extern unsigned int capacity_margin; -extern struct static_key __sched_freq; - -static inline bool sched_freq(void) +static inline unsigned long cpu_util_freq(int cpu) { - return static_key_false(&__sched_freq); -} - -DECLARE_PER_CPU(struct sched_capacity_reqs, cpu_sched_capacity_reqs); -void update_cpu_capacity_request(int cpu, bool request); - -static inline void set_cfs_cpu_capacity(int cpu, bool request, - unsigned long capacity) -{ - struct sched_capacity_reqs *scr = &per_cpu(cpu_sched_capacity_reqs, cpu); - - if (scr->cfs != capacity) { - scr->cfs = capacity; - update_cpu_capacity_request(cpu, request); - } -} + unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg; + unsigned long capacity = capacity_orig_of(cpu); -static inline void set_rt_cpu_capacity(int cpu, bool request, - unsigned long capacity) -{ - if (per_cpu(cpu_sched_capacity_reqs, cpu).rt != capacity) { - per_cpu(cpu_sched_capacity_reqs, cpu).rt = capacity; - update_cpu_capacity_request(cpu, request); - } +#ifdef 
CONFIG_SCHED_WALT + if (!walt_disabled && sysctl_sched_use_walt_cpu_util) + util = div64_u64(cpu_rq(cpu)->prev_runnable_sum, + walt_ravg_window >> SCHED_LOAD_SHIFT); +#endif + return (util >= capacity) ? capacity : util; } -static inline void set_dl_cpu_capacity(int cpu, bool request, - unsigned long capacity) -{ - if (per_cpu(cpu_sched_capacity_reqs, cpu).dl != capacity) { - per_cpu(cpu_sched_capacity_reqs, cpu).dl = capacity; - update_cpu_capacity_request(cpu, request); - } -} -#else -#define sched_freq() false -static inline void set_cfs_cpu_capacity(int cpu, bool request, - unsigned long capacity) -{ } -static inline void set_rt_cpu_capacity(int cpu, bool request, - unsigned long capacity) -{ } -static inline void set_dl_cpu_capacity(int cpu, bool request, - unsigned long capacity) -{ } #endif #ifdef CONFIG_SCHED_HMP @@ -2671,6 +2694,11 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) __release(rq2->lock); } +/* + * task_may_not_preempt - check whether a task may not be preemptible soon + */ +extern bool task_may_not_preempt(struct task_struct *task, int cpu); + #else /* CONFIG_SMP */ /* @@ -2792,3 +2820,66 @@ static inline u64 irq_time_read(int cpu) } #endif /* CONFIG_64BIT */ #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ + +#ifdef CONFIG_CPU_FREQ +DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); + +/** + * cpufreq_update_util - Take a note about CPU utilization changes. + * @rq: Runqueue to carry out the update for. + * @flags: Update reason flags. + * + * This function is called by the scheduler on the CPU whose utilization is + * being updated. + * + * It can only be called from RCU-sched read-side critical sections. + * + * The way cpufreq is currently arranged requires it to evaluate the CPU + * performance state (frequency/voltage) on a regular basis to prevent it from + * being stuck in a completely inadequate performance level for too long. + * That is not guaranteed to happen if the updates are only triggered from CFS, + * though, because they may not be coming in if RT or deadline tasks are active + * all the time (or there are RT and DL tasks only). + * + * As a workaround for that issue, this function is called by the RT and DL + * sched classes to trigger extra cpufreq updates to prevent it from stalling, + * but that really is a band-aid. Going forward it should be replaced with + * solutions targeted more specifically at RT and DL tasks. 
+ */ +static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) +{ + struct update_util_data *data; + + data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)); + if (data) + data->func(data, rq_clock(rq), flags); +} + +static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) +{ + if (cpu_of(rq) == smp_processor_id()) + cpufreq_update_util(rq, flags); +} +#else +static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} +static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) {} +#endif /* CONFIG_CPU_FREQ */ + +#ifdef CONFIG_SCHED_WALT + +static inline bool +walt_task_in_cum_window_demand(struct rq *rq, struct task_struct *p) +{ + return cpu_of(rq) == task_cpu(p) && + (p->on_rq || p->last_sleep_ts >= rq->window_start); +} + +#endif /* CONFIG_SCHED_WALT */ + +#ifdef arch_scale_freq_capacity +#ifndef arch_scale_freq_invariant +#define arch_scale_freq_invariant() (true) +#endif +#else /* arch_scale_freq_capacity */ +#define arch_scale_freq_invariant() (false) +#endif diff --git a/kernel/sched/sched_avg.c b/kernel/sched/sched_avg.c index ba5a326a9fd8..f03ed685f102 100644 --- a/kernel/sched/sched_avg.c +++ b/kernel/sched/sched_avg.c @@ -1,4 +1,4 @@ -/* Copyright (c) 2012, 2015-2017, The Linux Foundation. All rights reserved. +/* Copyright (c) 2012, 2015-2017, 2018 The Linux Foundation. All rights reserved. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 and @@ -119,6 +119,43 @@ void sched_get_nr_running_avg(int *avg, int *iowait_avg, int *big_avg, } EXPORT_SYMBOL(sched_get_nr_running_avg); +static DEFINE_PER_CPU(atomic64_t, last_busy_time) = ATOMIC64_INIT(0); + +#define BUSY_NR_RUN 3 +#define BUSY_LOAD_FACTOR 10 + +#ifdef CONFIG_SCHED_HMP +static inline void update_last_busy_time(int cpu, bool dequeue, + unsigned long prev_nr_run, u64 curr_time) +{ + bool nr_run_trigger = false, load_trigger = false; + + if (!hmp_capable() || is_min_capacity_cpu(cpu)) + return; + + if (prev_nr_run >= BUSY_NR_RUN && per_cpu(nr, cpu) < BUSY_NR_RUN) + nr_run_trigger = true; + + if (dequeue) { + u64 load; + + load = cpu_rq(cpu)->hmp_stats.cumulative_runnable_avg; + load = scale_load_to_cpu(load, cpu); + + if (load * BUSY_LOAD_FACTOR > sched_ravg_window) + load_trigger = true; + } + + if (nr_run_trigger || load_trigger) + atomic64_set(&per_cpu(last_busy_time, cpu), curr_time); +} +#else +static inline void update_last_busy_time(int cpu, bool dequeue, + unsigned long prev_nr_run, u64 curr_time) +{ +} +#endif + /** * sched_update_nr_prod * @cpu: The core id of the nr running driver. 
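For illustration of the calling convention only: the cpufreq_update_util()/cpufreq_update_this_cpu() helpers added to sched.h above simply dereference the per-CPU cpufreq_update_util_data pointer and invoke its func member with (data, rq_clock(rq), flags). A minimal consumer sketch, assuming the usual cpufreq_add_update_util_hook() registration helper is provided elsewhere in this series; that helper name and all demo_* identifiers are assumptions, not part of this patch:

#include <linux/percpu.h>
#include <linux/sched.h>

struct demo_gov_cpu {
	struct update_util_data update_util;
	u64 last_update;
};

static DEFINE_PER_CPU(struct demo_gov_cpu, demo_gov_cpu_data);

/* Runs in scheduler context with the rq lock held, so keep it cheap. */
static void demo_gov_update(struct update_util_data *data, u64 time,
			    unsigned int flags)
{
	struct demo_gov_cpu *dg = container_of(data, struct demo_gov_cpu,
					       update_util);

	dg->last_update = time;
	if (flags & SCHED_CPUFREQ_RT) {
		/* e.g. ramp the CPU towards fmax while RT activity is seen */
	}
}

static void demo_gov_start_cpu(int cpu)
{
	struct demo_gov_cpu *dg = &per_cpu(demo_gov_cpu_data, cpu);

	/*
	 * Assumed registration helper: publishes the pointer that
	 * cpufreq_update_util() dereferences under rcu_read_lock_sched().
	 */
	cpufreq_add_update_util_hook(cpu, &dg->update_util, demo_gov_update);
}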
@@ -147,9 +184,16 @@ void sched_update_nr_prod(int cpu, long delta, bool inc) if (per_cpu(nr, cpu) > per_cpu(nr_max, cpu)) per_cpu(nr_max, cpu) = per_cpu(nr, cpu); + update_last_busy_time(cpu, !inc, nr_running, curr_time); + per_cpu(nr_prod_sum, cpu) += nr_running * diff; per_cpu(nr_big_prod_sum, cpu) += nr_eligible_big_tasks(cpu) * diff; per_cpu(iowait_prod_sum, cpu) += nr_iowait_cpu(cpu) * diff; spin_unlock_irqrestore(&per_cpu(nr_lock, cpu), flags); } EXPORT_SYMBOL(sched_update_nr_prod); + +u64 sched_get_cpu_last_busy_time(int cpu) +{ + return atomic64_read(&per_cpu(last_busy_time, cpu)); +} diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index 87e2c9f0c33e..6d74a7c77c8c 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -12,6 +12,28 @@ */ #define SCHEDSTAT_VERSION 15 +#ifdef CONFIG_SMP +static inline void show_easstat(struct seq_file *seq, struct eas_stats *stats) +{ + /* eas-specific runqueue stats */ + seq_printf(seq, "eas %llu %llu %llu %llu %llu %llu ", + stats->sis_attempts, stats->sis_idle, stats->sis_cache_affine, + stats->sis_suff_cap, stats->sis_idle_cpu, stats->sis_count); + + seq_printf(seq, "%llu %llu %llu %llu %llu %llu %llu ", + stats->secb_attempts, stats->secb_sync, stats->secb_idle_bt, + stats->secb_insuff_cap, stats->secb_no_nrg_sav, + stats->secb_nrg_sav, stats->secb_count); + + seq_printf(seq, "%llu %llu %llu %llu %llu ", + stats->fbt_attempts, stats->fbt_no_cpu, stats->fbt_no_sd, + stats->fbt_pref_idle, stats->fbt_count); + + seq_printf(seq, "%llu %llu\n", + stats->cas_attempts, stats->cas_count); +} +#endif + static int show_schedstat(struct seq_file *seq, void *v) { int cpu; @@ -40,6 +62,8 @@ static int show_schedstat(struct seq_file *seq, void *v) seq_printf(seq, "\n"); #ifdef CONFIG_SMP + show_easstat(seq, &rq->eas_stats); + /* domain-specific stats */ rcu_read_lock(); for_each_domain(cpu, sd) { @@ -66,6 +90,8 @@ static int show_schedstat(struct seq_file *seq, void *v) sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed, sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance); + + show_easstat(seq, &sd->eas_stats); } rcu_read_unlock(); #endif diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 134da1cc8fce..3278c81cefb1 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -11,7 +11,8 @@ #ifdef CONFIG_SMP static int -select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags) +select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags, + int sibling_count_hint) { return task_cpu(p); /* stop tasks as never migrate */ } diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index b0c5fe6d1f3b..9c56841227cc 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -12,11 +12,12 @@ #include "tune.h" #ifdef CONFIG_CGROUP_SCHEDTUNE -static bool schedtune_initialized = false; +bool schedtune_initialized = false; #endif unsigned int sysctl_sched_cfs_boost __read_mostly; +extern struct reciprocal_value schedtune_spc_rdiv; extern struct target_nrg schedtune_target_nrg; /* Performance Boost region (B) threshold params */ @@ -675,6 +676,9 @@ int schedtune_task_boost(struct task_struct *p) struct schedtune *st; int task_boost; + if (!unlikely(schedtune_initialized)) + return 0; + /* Get task boost value */ rcu_read_lock(); st = task_schedtune(p); @@ -689,6 +693,9 @@ int schedtune_prefer_idle(struct task_struct *p) struct schedtune *st; int prefer_idle; + if (!unlikely(schedtune_initialized)) + return 0; + /* Get prefer_idle value */ rcu_read_lock(); st = task_schedtune(p); 
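The sched_get_cpu_last_busy_time() accessor added above exposes the timestamp that update_last_busy_time() records when a CPU's runnable count falls back below BUSY_NR_RUN or when a task with a large scaled load is dequeued. A hypothetical consumer sketch; the 100 ms threshold and the demo_* name are illustrative, and it assumes the curr_time stamp in sched_update_nr_prod() comes from sched_clock() and that the accessor's prototype is exported in a header:

#include <linux/sched.h>
#include <linux/time.h>

#define DEMO_RECENT_BUSY_NS	(100 * NSEC_PER_MSEC)

/* e.g. a hotplug/isolation policy could skip CPUs that were busy recently */
static bool demo_cpu_recently_busy(int cpu)
{
	u64 last = sched_get_cpu_last_busy_time(cpu);

	return last && (sched_clock() - last) < DEMO_RECENT_BUSY_NS;
}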
@@ -1121,9 +1128,12 @@ schedtune_init(void) pr_info("schedtune: configured to support global boosting only\n"); #endif + schedtune_spc_rdiv = reciprocal_value(100); + return 0; nodata: + pr_warning("schedtune: disabled!\n"); rcu_read_unlock(); return -EINVAL; } diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c index 6e053bd9830c..8d25ffbe4fed 100644 --- a/kernel/sched/walt.c +++ b/kernel/sched/walt.c @@ -20,7 +20,6 @@ */ #include <linux/syscore_ops.h> -#include <linux/cpufreq.h> #include <trace/events/sched.h> #include "sched.h" #include "walt.h" @@ -42,40 +41,17 @@ static __read_mostly unsigned int walt_io_is_busy = 0; unsigned int sysctl_sched_walt_init_task_load_pct = 15; -/* 1 -> use PELT based load stats, 0 -> use window-based load stats */ -unsigned int __read_mostly walt_disabled = 0; - -static unsigned int max_possible_efficiency = 1024; -static unsigned int min_possible_efficiency = 1024; +/* true -> use PELT based load stats, false -> use window-based load stats */ +bool __read_mostly walt_disabled = false; /* - * Maximum possible frequency across all cpus. Task demand and cpu - * capacity (cpu_power) metrics are scaled in reference to it. + * Window size (in ns). Adjust for the tick size so that the window + * rollover occurs just before the tick boundary. */ -static unsigned int max_possible_freq = 1; - -/* - * Minimum possible max_freq across all cpus. This will be same as - * max_possible_freq on homogeneous systems and could be different from - * max_possible_freq on heterogenous systems. min_max_freq is used to derive - * capacity (cpu_power) of cpus. - */ -static unsigned int min_max_freq = 1; - -static unsigned int max_load_scale_factor = 1024; -static unsigned int max_possible_capacity = 1024; - -/* Mask of all CPUs that have max_possible_capacity */ -static cpumask_t mpc_mask = CPU_MASK_ALL; - -/* Window size (in ns) */ -__read_mostly unsigned int walt_ravg_window = 20000000; - -/* Min window size (in ns) = 10ms */ -#define MIN_SCHED_RAVG_WINDOW 10000000 - -/* Max window size (in ns) = 1s */ -#define MAX_SCHED_RAVG_WINDOW 1000000000 +__read_mostly unsigned int walt_ravg_window = + (20000000 / TICK_NSEC) * TICK_NSEC; +#define MIN_SCHED_RAVG_WINDOW ((10000000 / TICK_NSEC) * TICK_NSEC) +#define MAX_SCHED_RAVG_WINDOW ((1000000000 / TICK_NSEC) * TICK_NSEC) static unsigned int sync_cpu; static ktime_t ktime_last; @@ -86,11 +62,28 @@ static unsigned int task_load(struct task_struct *p) return p->ravg.demand; } +static inline void fixup_cum_window_demand(struct rq *rq, s64 delta) +{ + rq->cum_window_demand += delta; + if (unlikely((s64)rq->cum_window_demand < 0)) + rq->cum_window_demand = 0; +} + void walt_inc_cumulative_runnable_avg(struct rq *rq, struct task_struct *p) { rq->cumulative_runnable_avg += p->ravg.demand; + + /* + * Add a task's contribution to the cumulative window demand when + * + * (1) task is enqueued with on_rq = 1 i.e migration, + * prio/cgroup/class change. + * (2) task is waking for the first time in this window. + */ + if (p->on_rq || (p->last_sleep_ts < rq->window_start)) + fixup_cum_window_demand(rq, p->ravg.demand); } void @@ -99,16 +92,28 @@ walt_dec_cumulative_runnable_avg(struct rq *rq, { rq->cumulative_runnable_avg -= p->ravg.demand; BUG_ON((s64)rq->cumulative_runnable_avg < 0); + + /* + * on_rq will be 1 for sleeping tasks. So check if the task + * is migrating or dequeuing in RUNNING state to change the + * prio/cgroup/class. 
+ */ + if (task_on_rq_migrating(p) || p->state == TASK_RUNNING) + fixup_cum_window_demand(rq, -(s64)p->ravg.demand); } static void fixup_cumulative_runnable_avg(struct rq *rq, - struct task_struct *p, s64 task_load_delta) + struct task_struct *p, u64 new_task_load) { + s64 task_load_delta = (s64)new_task_load - task_load(p); + rq->cumulative_runnable_avg += task_load_delta; if ((s64)rq->cumulative_runnable_avg < 0) panic("cra less than zero: tld: %lld, task_load(p) = %u\n", task_load_delta, task_load(p)); + + fixup_cum_window_demand(rq, task_load_delta); } u64 walt_ktime_clock(void) @@ -167,10 +172,28 @@ static int exiting_task(struct task_struct *p) static int __init set_walt_ravg_window(char *str) { + unsigned int adj_window; + bool no_walt = walt_disabled; + get_option(&str, &walt_ravg_window); - walt_disabled = (walt_ravg_window < MIN_SCHED_RAVG_WINDOW || - walt_ravg_window > MAX_SCHED_RAVG_WINDOW); + /* Adjust for CONFIG_HZ */ + adj_window = (walt_ravg_window / TICK_NSEC) * TICK_NSEC; + + /* Warn if we're a bit too far away from the expected window size */ + WARN(adj_window < walt_ravg_window - NSEC_PER_MSEC, + "tick-adjusted window size %u, original was %u\n", adj_window, + walt_ravg_window); + + walt_ravg_window = adj_window; + + walt_disabled = walt_disabled || + (walt_ravg_window < MIN_SCHED_RAVG_WINDOW || + walt_ravg_window > MAX_SCHED_RAVG_WINDOW); + + WARN(!no_walt && walt_disabled, + "invalid window size, disabling WALT\n"); + return 0; } @@ -194,26 +217,20 @@ update_window_start(struct rq *rq, u64 wallclock) nr_windows = div64_u64(delta, walt_ravg_window); rq->window_start += (u64)nr_windows * (u64)walt_ravg_window; + + rq->cum_window_demand = rq->cumulative_runnable_avg; } +/* + * Translate absolute delta time accounted on a CPU + * to a scale where 1024 is the capacity of the most + * capable CPU running at FMAX + */ static u64 scale_exec_time(u64 delta, struct rq *rq) { - unsigned int cur_freq = rq->cur_freq; - int sf; - - if (unlikely(cur_freq > max_possible_freq)) - cur_freq = rq->max_possible_freq; - - /* round up div64 */ - delta = div64_u64(delta * cur_freq + max_possible_freq - 1, - max_possible_freq); - - sf = DIV_ROUND_UP(rq->efficiency * 1024, max_possible_efficiency); + unsigned long capcurr = capacity_curr_of(cpu_of(rq)); - delta *= sf; - delta >>= 10; - - return delta; + return (delta * capcurr) >> SCHED_CAPACITY_SHIFT; } static int cpu_is_waiting_on_io(struct rq *rq) @@ -590,10 +607,20 @@ static void update_history(struct rq *rq, struct task_struct *p, * A throttled deadline sched class task gets dequeued without * changing p->on_rq. Since the dequeue decrements hmp stats * avoid decrementing it here again. + * + * When window is rolled over, the cumulative window demand + * is reset to the cumulative runnable average (contribution from + * the tasks on the runqueue). If the current task is dequeued + * already, it's demand is not included in the cumulative runnable + * average. So add the task demand separately to cumulative window + * demand. 
*/ - if (task_on_rq_queued(p) && (!task_has_dl_policy(p) || - !p->dl.dl_throttled)) - fixup_cumulative_runnable_avg(rq, p, demand); + if (!task_has_dl_policy(p) || !p->dl.dl_throttled) { + if (task_on_rq_queued(p)) + fixup_cumulative_runnable_avg(rq, p, demand); + else if (rq->curr == p) + fixup_cum_window_demand(rq, demand); + } p->ravg.demand = demand; @@ -736,33 +763,6 @@ done: p->ravg.mark_start = wallclock; } -unsigned long __weak arch_get_cpu_efficiency(int cpu) -{ - return SCHED_LOAD_SCALE; -} - -void walt_init_cpu_efficiency(void) -{ - int i, efficiency; - unsigned int max = 0, min = UINT_MAX; - - for_each_possible_cpu(i) { - efficiency = arch_get_cpu_efficiency(i); - cpu_rq(i)->efficiency = efficiency; - - if (efficiency > max) - max = efficiency; - if (efficiency < min) - min = efficiency; - } - - if (max) - max_possible_efficiency = max; - - if (min) - min_possible_efficiency = min; -} - static void reset_task_stats(struct task_struct *p) { u32 sum = 0; @@ -794,11 +794,11 @@ void walt_set_window_start(struct rq *rq) int cpu = cpu_of(rq); struct rq *sync_rq = cpu_rq(sync_cpu); - if (rq->window_start) + if (likely(rq->window_start)) return; if (cpu == sync_cpu) { - rq->window_start = walt_ktime_clock(); + rq->window_start = 1; } else { raw_spin_unlock(&rq->lock); double_rq_lock(rq, sync_rq); @@ -841,6 +841,17 @@ void walt_fixup_busy_time(struct task_struct *p, int new_cpu) walt_update_task_ravg(p, task_rq(p), TASK_MIGRATE, wallclock, 0); + /* + * When a task is migrating during the wakeup, adjust + * the task's contribution towards cumulative window + * demand. + */ + if (p->state == TASK_WAKING && + p->last_sleep_ts >= src_rq->window_start) { + fixup_cum_window_demand(src_rq, -(s64)p->ravg.demand); + fixup_cum_window_demand(dest_rq, p->ravg.demand); + } + if (p->ravg.curr_window) { src_rq->curr_runnable_sum -= p->ravg.curr_window; dest_rq->curr_runnable_sum += p->ravg.curr_window; @@ -867,242 +878,6 @@ void walt_fixup_busy_time(struct task_struct *p, int new_cpu) double_rq_unlock(src_rq, dest_rq); } -/* - * Return 'capacity' of a cpu in reference to "least" efficient cpu, such that - * least efficient cpu gets capacity of 1024 - */ -static unsigned long capacity_scale_cpu_efficiency(int cpu) -{ - return (1024 * cpu_rq(cpu)->efficiency) / min_possible_efficiency; -} - -/* - * Return 'capacity' of a cpu in reference to cpu with lowest max_freq - * (min_max_freq), such that one with lowest max_freq gets capacity of 1024. - */ -static unsigned long capacity_scale_cpu_freq(int cpu) -{ - return (1024 * cpu_rq(cpu)->max_freq) / min_max_freq; -} - -/* - * Return load_scale_factor of a cpu in reference to "most" efficient cpu, so - * that "most" efficient cpu gets a load_scale_factor of 1 - */ -static unsigned long load_scale_cpu_efficiency(int cpu) -{ - return DIV_ROUND_UP(1024 * max_possible_efficiency, - cpu_rq(cpu)->efficiency); -} - -/* - * Return load_scale_factor of a cpu in reference to cpu with best max_freq - * (max_possible_freq), so that one with best max_freq gets a load_scale_factor - * of 1. 
- */ -static unsigned long load_scale_cpu_freq(int cpu) -{ - return DIV_ROUND_UP(1024 * max_possible_freq, cpu_rq(cpu)->max_freq); -} - -static int compute_capacity(int cpu) -{ - int capacity = 1024; - - capacity *= capacity_scale_cpu_efficiency(cpu); - capacity >>= 10; - - capacity *= capacity_scale_cpu_freq(cpu); - capacity >>= 10; - - return capacity; -} - -static int compute_load_scale_factor(int cpu) -{ - int load_scale = 1024; - - /* - * load_scale_factor accounts for the fact that task load - * is in reference to "best" performing cpu. Task's load will need to be - * scaled (up) by a factor to determine suitability to be placed on a - * (little) cpu. - */ - load_scale *= load_scale_cpu_efficiency(cpu); - load_scale >>= 10; - - load_scale *= load_scale_cpu_freq(cpu); - load_scale >>= 10; - - return load_scale; -} - -static int cpufreq_notifier_policy(struct notifier_block *nb, - unsigned long val, void *data) -{ - struct cpufreq_policy *policy = (struct cpufreq_policy *)data; - int i, update_max = 0; - u64 highest_mpc = 0, highest_mplsf = 0; - const struct cpumask *cpus = policy->related_cpus; - unsigned int orig_min_max_freq = min_max_freq; - unsigned int orig_max_possible_freq = max_possible_freq; - /* Initialized to policy->max in case policy->related_cpus is empty! */ - unsigned int orig_max_freq = policy->max; - - if (val != CPUFREQ_NOTIFY) - return 0; - - for_each_cpu(i, policy->related_cpus) { - cpumask_copy(&cpu_rq(i)->freq_domain_cpumask, - policy->related_cpus); - orig_max_freq = cpu_rq(i)->max_freq; - cpu_rq(i)->min_freq = policy->min; - cpu_rq(i)->max_freq = policy->max; - cpu_rq(i)->cur_freq = policy->cur; - cpu_rq(i)->max_possible_freq = policy->cpuinfo.max_freq; - } - - max_possible_freq = max(max_possible_freq, policy->cpuinfo.max_freq); - if (min_max_freq == 1) - min_max_freq = UINT_MAX; - min_max_freq = min(min_max_freq, policy->cpuinfo.max_freq); - BUG_ON(!min_max_freq); - BUG_ON(!policy->max); - - /* Changes to policy other than max_freq don't require any updates */ - if (orig_max_freq == policy->max) - return 0; - - /* - * A changed min_max_freq or max_possible_freq (possible during bootup) - * needs to trigger re-computation of load_scale_factor and capacity for - * all possible cpus (even those offline). It also needs to trigger - * re-computation of nr_big_task count on all online cpus. - * - * A changed rq->max_freq otoh needs to trigger re-computation of - * load_scale_factor and capacity for just the cluster of cpus involved. - * Since small task definition depends on max_load_scale_factor, a - * changed load_scale_factor of one cluster could influence - * classification of tasks in another cluster. Hence a changed - * rq->max_freq will need to trigger re-computation of nr_big_task - * count on all online cpus. - * - * While it should be sufficient for nr_big_tasks to be - * re-computed for only online cpus, we have inadequate context - * information here (in policy notifier) with regard to hotplug-safety - * context in which notification is issued. As a result, we can't use - * get_online_cpus() here, as it can lead to deadlock. Until cpufreq is - * fixed up to issue notification always in hotplug-safe context, - * re-compute nr_big_task for all possible cpus. - */ - - if (orig_min_max_freq != min_max_freq || - orig_max_possible_freq != max_possible_freq) { - cpus = cpu_possible_mask; - update_max = 1; - } - - /* - * Changed load_scale_factor can trigger reclassification of tasks as - * big or small. 
Make this change "atomic" so that tasks are accounted - * properly due to changed load_scale_factor - */ - for_each_cpu(i, cpus) { - struct rq *rq = cpu_rq(i); - - rq->capacity = compute_capacity(i); - rq->load_scale_factor = compute_load_scale_factor(i); - - if (update_max) { - u64 mpc, mplsf; - - mpc = div_u64(((u64) rq->capacity) * - rq->max_possible_freq, rq->max_freq); - rq->max_possible_capacity = (int) mpc; - - mplsf = div_u64(((u64) rq->load_scale_factor) * - rq->max_possible_freq, rq->max_freq); - - if (mpc > highest_mpc) { - highest_mpc = mpc; - cpumask_clear(&mpc_mask); - cpumask_set_cpu(i, &mpc_mask); - } else if (mpc == highest_mpc) { - cpumask_set_cpu(i, &mpc_mask); - } - - if (mplsf > highest_mplsf) - highest_mplsf = mplsf; - } - } - - if (update_max) { - max_possible_capacity = highest_mpc; - max_load_scale_factor = highest_mplsf; - } - - return 0; -} - -static int cpufreq_notifier_trans(struct notifier_block *nb, - unsigned long val, void *data) -{ - struct cpufreq_freqs *freq = (struct cpufreq_freqs *)data; - unsigned int cpu = freq->cpu, new_freq = freq->new; - unsigned long flags; - int i; - - if (val != CPUFREQ_POSTCHANGE) - return 0; - - BUG_ON(!new_freq); - - if (cpu_rq(cpu)->cur_freq == new_freq) - return 0; - - for_each_cpu(i, &cpu_rq(cpu)->freq_domain_cpumask) { - struct rq *rq = cpu_rq(i); - - raw_spin_lock_irqsave(&rq->lock, flags); - walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, - walt_ktime_clock(), 0); - rq->cur_freq = new_freq; - raw_spin_unlock_irqrestore(&rq->lock, flags); - } - - return 0; -} - -static struct notifier_block notifier_policy_block = { - .notifier_call = cpufreq_notifier_policy -}; - -static struct notifier_block notifier_trans_block = { - .notifier_call = cpufreq_notifier_trans -}; - -static int register_sched_callback(void) -{ - int ret; - - ret = cpufreq_register_notifier(¬ifier_policy_block, - CPUFREQ_POLICY_NOTIFIER); - - if (!ret) - ret = cpufreq_register_notifier(¬ifier_trans_block, - CPUFREQ_TRANSITION_NOTIFIER); - - return 0; -} - -/* - * cpufreq callbacks can be registered at core_initcall or later time. - * Any registration done prior to that is "forgotten" by cpufreq. See - * initialization of variable init_cpufreq_transition_notifier_list_called - * for further information. - */ -core_initcall(register_sched_callback); - void walt_init_new_task_load(struct task_struct *p) { int i; diff --git a/kernel/sched/walt.h b/kernel/sched/walt.h index e181c87a928d..de7edac43674 100644 --- a/kernel/sched/walt.h +++ b/kernel/sched/walt.h @@ -55,8 +55,10 @@ static inline void walt_migrate_sync_cpu(int cpu) { } static inline void walt_init_cpu_efficiency(void) { } static inline u64 walt_ktime_clock(void) { return 0; } +#define walt_cpu_high_irqload(cpu) false + #endif /* CONFIG_SCHED_WALT */ -extern unsigned int walt_disabled; +extern bool walt_disabled; #endif diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 15a1795bbba1..efd384f3f852 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -457,14 +457,19 @@ static long seccomp_attach_filter(unsigned int flags, return 0; } +void __get_seccomp_filter(struct seccomp_filter *filter) +{ + /* Reference count is bounded by the number of total processes. */ + atomic_inc(&filter->usage); +} + /* get_seccomp_filter - increments the reference count of the filter on @tsk */ void get_seccomp_filter(struct task_struct *tsk) { struct seccomp_filter *orig = tsk->seccomp.filter; if (!orig) return; - /* Reference count is bounded by the number of total processes. 
*/ - atomic_inc(&orig->usage); + __get_seccomp_filter(orig); } static inline void seccomp_filter_free(struct seccomp_filter *filter) @@ -475,10 +480,8 @@ static inline void seccomp_filter_free(struct seccomp_filter *filter) } } -/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */ -void put_seccomp_filter(struct task_struct *tsk) +static void __put_seccomp_filter(struct seccomp_filter *orig) { - struct seccomp_filter *orig = tsk->seccomp.filter; /* Clean up single-reference branches iteratively. */ while (orig && atomic_dec_and_test(&orig->usage)) { struct seccomp_filter *freeme = orig; @@ -487,6 +490,12 @@ void put_seccomp_filter(struct task_struct *tsk) } } +/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */ +void put_seccomp_filter(struct task_struct *tsk) +{ + __put_seccomp_filter(tsk->seccomp.filter); +} + /** * seccomp_send_sigsys - signals the task to allow in-process syscall emulation * @syscall: syscall number to send to userland @@ -927,13 +936,13 @@ long seccomp_get_filter(struct task_struct *task, unsigned long filter_off, if (!data) goto out; - get_seccomp_filter(task); + __get_seccomp_filter(filter); spin_unlock_irq(&task->sighand->siglock); if (copy_to_user(data, fprog->filter, bpf_classic_proglen(fprog))) ret = -EFAULT; - put_seccomp_filter(task); + __put_seccomp_filter(filter); return ret; out: diff --git a/kernel/signal.c b/kernel/signal.c index f3f1f7a972fd..4a548c6a4118 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -72,7 +72,7 @@ static int sig_task_ignored(struct task_struct *t, int sig, bool force) handler = sig_handler(t, sig); if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) && - handler == SIG_DFL && !force) + handler == SIG_DFL && !(force && sig_kernel_only(sig))) return 1; return sig_handler_ignored(handler, sig); @@ -88,13 +88,15 @@ static int sig_ignored(struct task_struct *t, int sig, bool force) if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig)) return 0; - if (!sig_task_ignored(t, sig, force)) - return 0; - /* - * Tracers may want to know about even ignored signals. + * Tracers may want to know about even ignored signal unless it + * is SIGKILL which can't be reported anyway but can be ignored + * by SIGNAL_UNKILLABLE task. */ - return !t->ptrace; + if (t->ptrace && sig != SIGKILL) + return 0; + + return sig_task_ignored(t, sig, force); } /* @@ -346,7 +348,7 @@ static bool task_participate_group_stop(struct task_struct *task) * fresh group stop. Read comment in do_signal_stop() for details. 
*/ if (!sig->group_stop_count && !(sig->flags & SIGNAL_STOP_STOPPED)) { - sig->flags = SIGNAL_STOP_STOPPED; + signal_set_stop_flags(sig, SIGNAL_STOP_STOPPED); return true; } return false; @@ -503,7 +505,8 @@ int unhandled_signal(struct task_struct *tsk, int sig) return !tsk->ptrace; } -static void collect_signal(int sig, struct sigpending *list, siginfo_t *info) +static void collect_signal(int sig, struct sigpending *list, siginfo_t *info, + bool *resched_timer) { struct sigqueue *q, *first = NULL; @@ -525,6 +528,12 @@ static void collect_signal(int sig, struct sigpending *list, siginfo_t *info) still_pending: list_del_init(&first->list); copy_siginfo(info, &first->info); + + *resched_timer = + (first->flags & SIGQUEUE_PREALLOC) && + (info->si_code == SI_TIMER) && + (info->si_sys_private); + __sigqueue_free(first); } else { /* @@ -541,12 +550,12 @@ still_pending: } static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, - siginfo_t *info) + siginfo_t *info, bool *resched_timer) { int sig = next_signal(pending, mask); if (sig) - collect_signal(sig, pending, info); + collect_signal(sig, pending, info, resched_timer); return sig; } @@ -558,15 +567,16 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, */ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) { + bool resched_timer = false; int signr; /* We only dequeue private signals from ourselves, we don't let * signalfd steal them */ - signr = __dequeue_signal(&tsk->pending, mask, info); + signr = __dequeue_signal(&tsk->pending, mask, info, &resched_timer); if (!signr) { signr = __dequeue_signal(&tsk->signal->shared_pending, - mask, info); + mask, info, &resched_timer); /* * itimer signal ? * @@ -611,7 +621,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) */ current->jobctl |= JOBCTL_STOP_DEQUEUED; } - if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) { + if (resched_timer) { /* * Release the siglock to ensure proper locking order * of timer locks outside of siglocks. Note, we leave @@ -837,7 +847,7 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force) * will take ->siglock, notice SIGNAL_CLD_MASK, and * notify its parent. See get_signal_to_deliver(). */ - signal->flags = why | SIGNAL_STOP_CONTINUED; + signal_set_stop_flags(signal, why | SIGNAL_STOP_CONTINUED); signal->group_stop_count = 0; signal->group_exit_code = 0; } @@ -909,9 +919,9 @@ static void complete_signal(int sig, struct task_struct *p, int group) * then start taking the whole group down immediately. */ if (sig_fatal(p, sig) && - !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) && + !(signal->flags & SIGNAL_GROUP_EXIT) && !sigismember(&t->real_blocked, sig) && - (sig == SIGKILL || !t->ptrace)) { + (sig == SIGKILL || !p->ptrace)) { /* * This signal will be fatal to the whole group. */ diff --git a/kernel/softirq.c b/kernel/softirq.c index 479e4436f787..d69b77fc7cc1 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -57,6 +57,13 @@ static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp DEFINE_PER_CPU(struct task_struct *, ksoftirqd); +/* + * active_softirqs -- per cpu, a mask of softirqs that are being handled, + * with the expectation that approximate answers are acceptable and therefore + * no synchronization. 
+ */ +DEFINE_PER_CPU(__u32, active_softirqs); + const char * const softirq_to_name[NR_SOFTIRQS] = { "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", "TASKLET", "SCHED", "HRTIMER", "RCU" @@ -227,13 +234,24 @@ static inline bool lockdep_softirq_start(void) { return false; } static inline void lockdep_softirq_end(bool in_hardirq) { } #endif -asmlinkage __visible void __do_softirq(void) +#define softirq_deferred_for_rt(pending) \ +({ \ + __u32 deferred = 0; \ + if (cpupri_check_rt()) { \ + deferred = pending & LONG_SOFTIRQ_MASK; \ + pending &= ~LONG_SOFTIRQ_MASK; \ + } \ + deferred; \ +}) + +asmlinkage __visible void __softirq_entry __do_softirq(void) { unsigned long end = jiffies + MAX_SOFTIRQ_TIME; unsigned long old_flags = current->flags; int max_restart = MAX_SOFTIRQ_RESTART; struct softirq_action *h; bool in_hardirq; + __u32 deferred; __u32 pending; int softirq_bit; @@ -245,14 +263,15 @@ asmlinkage __visible void __do_softirq(void) current->flags &= ~PF_MEMALLOC; pending = local_softirq_pending(); + deferred = softirq_deferred_for_rt(pending); account_irq_enter_time(current); - __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET); in_hardirq = lockdep_softirq_start(); restart: /* Reset the pending bitmask before enabling irqs */ - set_softirq_pending(0); + set_softirq_pending(deferred); + __this_cpu_write(active_softirqs, pending); local_irq_enable(); @@ -282,18 +301,21 @@ restart: pending >>= softirq_bit; } + __this_cpu_write(active_softirqs, 0); rcu_bh_qs(); local_irq_disable(); pending = local_softirq_pending(); + deferred = softirq_deferred_for_rt(pending); + if (pending) { if (time_before(jiffies, end) && !need_resched() && --max_restart) goto restart; - - wakeup_softirqd(); } + if (pending | deferred) + wakeup_softirqd(); lockdep_softirq_end(in_hardirq); account_irq_exit_time(current); __local_bh_enable(SOFTIRQ_OFFSET); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 816999804a16..bc4ca30ddc21 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -522,13 +522,6 @@ static struct ctl_table kern_table[] = { .extra2 = &max_sched_granularity_ns, }, { - .procname = "sched_is_big_little", - .data = &sysctl_sched_is_big_little, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { .procname = "sched_sync_hint_enable", .data = &sysctl_sched_sync_hint_enable, .maxlen = sizeof(unsigned int), @@ -536,13 +529,6 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, { - .procname = "sched_initial_task_util", - .data = &sysctl_sched_initial_task_util, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { .procname = "sched_cstate_aware", .data = &sysctl_sched_cstate_aware, .maxlen = sizeof(unsigned int), @@ -1420,6 +1406,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = timer_migration_handler, + .extra1 = &zero, + .extra2 = &one, }, #endif #ifdef CONFIG_BPF_SYSCALL @@ -2387,9 +2375,12 @@ static int do_proc_douintvec_conv(bool *negp, unsigned long *lvalp, if (write) { if (*negp) return -EINVAL; + if (*lvalp > UINT_MAX) + return -EINVAL; *valp = *lvalp; } else { unsigned int val = *valp; + *negp = false; *lvalp = (unsigned long)val; } return 0; diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 4a816bab38a2..d7612fcba10a 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -255,6 +255,7 @@ static const struct bin_table bin_net_ipv4_conf_vars_table[] = { { CTL_INT, NET_IPV4_CONF_NOPOLICY, 
"disable_policy" }, { CTL_INT, NET_IPV4_CONF_FORCE_IGMP_VERSION, "force_igmp_version" }, { CTL_INT, NET_IPV4_CONF_PROMOTE_SECONDARIES, "promote_secondaries" }, + { CTL_INT, NET_IPV4_CONF_NF_IPV4_DEFRAG_SKIP, "nf_ipv4_defrag_skip" }, {} }; diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 2af5687b83c9..ceec77c652b5 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -569,7 +569,7 @@ void alarm_start_relative(struct alarm *alarm, ktime_t start) { struct alarm_base *base = &alarm_bases[alarm->type]; - start = ktime_add(start, base->gettime()); + start = ktime_add_safe(start, base->gettime()); alarm_start(alarm, start); } EXPORT_SYMBOL_GPL(alarm_start_relative); @@ -655,7 +655,7 @@ u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval) overrun++; } - alarm->node.expires = ktime_add(alarm->node.expires, interval); + alarm->node.expires = ktime_add_safe(alarm->node.expires, interval); return overrun; } EXPORT_SYMBOL_GPL(alarm_forward); @@ -843,13 +843,22 @@ static int alarm_timer_set(struct k_itimer *timr, int flags, /* start the timer */ timr->it.alarm.interval = timespec_to_ktime(new_setting->it_interval); + + /* + * Rate limit to the tick as a hot fix to prevent DOS. Will be + * mopped up later. + */ + if (timr->it.alarm.interval.tv64 && + ktime_to_ns(timr->it.alarm.interval) < TICK_NSEC) + timr->it.alarm.interval = ktime_set(0, TICK_NSEC); + exp = timespec_to_ktime(new_setting->it_value); /* Convert (if necessary) to absolute time */ if (flags != TIMER_ABSTIME) { ktime_t now; now = alarm_bases[timr->it.alarm.alarmtimer.type].gettime(); - exp = ktime_add(now, exp); + exp = ktime_add_safe(now, exp); } alarm_start(&timr->it.alarm.alarmtimer, exp); diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index e7c2392666cb..beafdf94b3b5 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -312,7 +312,7 @@ EXPORT_SYMBOL_GPL(__ktime_divns); */ ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs) { - ktime_t res = ktime_add(lhs, rhs); + ktime_t res = ktime_add_unsafe(lhs, rhs); /* * We use KTIME_SEC_MAX here, the maximum timeout which we can @@ -669,7 +669,9 @@ static void hrtimer_reprogram(struct hrtimer *timer, static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { base->expires_next.tv64 = KTIME_MAX; + base->hang_detected = 0; base->hres_active = 0; + base->next_timer = NULL; } /* @@ -1116,7 +1118,12 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, cpu_base = raw_cpu_ptr(&hrtimer_bases); - if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS) + /* + * POSIX magic: Relative CLOCK_REALTIME timers are not affected by + * clock modifications, so they needs to become CLOCK_MONOTONIC to + * ensure POSIX compliance. 
+ */ + if (clock_id == CLOCK_REALTIME && mode & HRTIMER_MODE_REL) clock_id = CLOCK_MONOTONIC; base = hrtimer_clockid_to_base(clock_id); @@ -1587,6 +1594,7 @@ static void init_hrtimers_cpu(int cpu) timerqueue_init_head(&cpu_base->clock_base[i].active); } + cpu_base->active_bases = 0; cpu_base->cpu = cpu; hrtimer_init_hres(cpu_base); } diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index f2826c35e918..fc7c37ad90a0 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -507,17 +507,22 @@ static struct pid *good_sigevent(sigevent_t * event) { struct task_struct *rtn = current->group_leader; - if ((event->sigev_notify & SIGEV_THREAD_ID ) && - (!(rtn = find_task_by_vpid(event->sigev_notify_thread_id)) || - !same_thread_group(rtn, current) || - (event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_SIGNAL)) + switch (event->sigev_notify) { + case SIGEV_SIGNAL | SIGEV_THREAD_ID: + rtn = find_task_by_vpid(event->sigev_notify_thread_id); + if (!rtn || !same_thread_group(rtn, current)) + return NULL; + /* FALLTHRU */ + case SIGEV_SIGNAL: + case SIGEV_THREAD: + if (event->sigev_signo <= 0 || event->sigev_signo > SIGRTMAX) + return NULL; + /* FALLTHRU */ + case SIGEV_NONE: + return task_pid(rtn); + default: return NULL; - - if (((event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) && - ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX))) - return NULL; - - return task_pid(rtn); + } } void posix_timers_register_clock(const clockid_t clock_id, @@ -745,8 +750,7 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) /* interval timer ? */ if (iv.tv64) cur_setting->it_interval = ktime_to_timespec(iv); - else if (!hrtimer_active(timer) && - (timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) + else if (!hrtimer_active(timer) && timr->it_sigev_notify != SIGEV_NONE) return; now = timer->base->get_time(); @@ -757,7 +761,7 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) * expiry is > now. */ if (iv.tv64 && (timr->it_requeue_pending & REQUEUE_PENDING || - (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) + timr->it_sigev_notify == SIGEV_NONE)) timr->it_overrun += (unsigned int) hrtimer_forward(timer, now, iv); remaining = __hrtimer_expires_remaining_adjusted(timer, now); @@ -767,7 +771,7 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) * A single shot SIGEV_NONE timer must return 0, when * it is expired ! */ - if ((timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) + if (timr->it_sigev_notify != SIGEV_NONE) cur_setting->it_value.tv_nsec = 1; } else cur_setting->it_value = ktime_to_timespec(remaining); @@ -865,7 +869,7 @@ common_timer_set(struct k_itimer *timr, int flags, timr->it.real.interval = timespec_to_ktime(new_setting->it_interval); /* SIGEV_NONE timers are not queued ! 
See common_timer_get */ - if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) { + if (timr->it_sigev_notify == SIGEV_NONE) { /* Setup correct expiry time for relative timers */ if (mode == HRTIMER_MODE_REL) { hrtimer_add_expires(timer, timer->base->get_time()); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index ec2102104cb8..6579be96e041 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -589,6 +589,11 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); } +static inline bool local_timer_softirq_pending(void) +{ + return local_softirq_pending() & TIMER_SOFTIRQ; +} + static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, ktime_t now, int cpu) { @@ -605,8 +610,18 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, } while (read_seqretry(&jiffies_lock, seq)); ts->last_jiffies = basejiff; - if (rcu_needs_cpu(basemono, &next_rcu) || - arch_needs_cpu() || irq_work_needs_cpu()) { + /* + * Keep the periodic tick, when RCU, architecture or irq_work + * requests it. + * Aside of that check whether the local timer softirq is + * pending. If so its a bad idea to call get_next_timer_interrupt() + * because there is an already expired timer, so it will request + * immeditate expiry, which rearms the hardware timer with a + * minimal delta which brings us back to this place + * immediately. Lather, rinse and repeat... + */ + if (rcu_needs_cpu(basemono, &next_rcu) || arch_needs_cpu() || + irq_work_needs_cpu() || local_timer_softirq_pending()) { next_tick = basemono + TICK_NSEC; } else { /* @@ -896,6 +911,18 @@ ktime_t tick_nohz_get_sleep_length(void) return ts->sleep_length; } +/** + * tick_nohz_get_idle_calls - return the current idle calls counter value + * + * Called from the schedutil frequency scaling governor in scheduler context. + */ +unsigned long tick_nohz_get_idle_calls(void) +{ + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + + return ts->idle_calls; +} + static void tick_nohz_account_idle_ticks(struct tick_sched *ts) { #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 5fa544f3f560..7902ecbce8ec 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -70,6 +70,10 @@ static inline void tk_normalize_xtime(struct timekeeper *tk) tk->tkr_mono.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_mono.shift; tk->xtime_sec++; } + while (tk->tkr_raw.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_raw.shift)) { + tk->tkr_raw.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_raw.shift; + tk->raw_sec++; + } } static inline struct timespec64 tk_xtime(struct timekeeper *tk) @@ -116,6 +120,26 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) tk->offs_boot = ktime_add(tk->offs_boot, delta); } +/* + * tk_clock_read - atomic clocksource read() helper + * + * This helper is necessary to use in the read paths because, while the + * seqlock ensures we don't return a bad value while structures are updated, + * it doesn't protect from potential crashes. There is the possibility that + * the tkr's clocksource may change between the read reference, and the + * clock reference passed to the read function. This can cause crashes if + * the wrong clocksource is passed to the wrong read function. + * This isn't necessary to use when holding the timekeeper_lock or doing + * a read of the fast-timekeeper tkrs (which is protected by its own locking + * and update logic). 
+ */ +static inline u64 tk_clock_read(struct tk_read_base *tkr) +{ + struct clocksource *clock = READ_ONCE(tkr->clock); + + return clock->read(clock); +} + #ifdef CONFIG_DEBUG_TIMEKEEPING #define WARNING_FREQ (HZ*300) /* 5 minute rate-limiting */ @@ -173,7 +197,7 @@ static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr) */ do { seq = read_seqcount_begin(&tk_core.seq); - now = tkr->read(tkr->clock); + now = tk_clock_read(tkr); last = tkr->cycle_last; mask = tkr->mask; max = tkr->clock->max_cycles; @@ -207,7 +231,7 @@ static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr) cycle_t cycle_now, delta; /* read clocksource */ - cycle_now = tkr->read(tkr->clock); + cycle_now = tk_clock_read(tkr); /* calculate the delta since the last update_wall_time */ delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask); @@ -235,12 +259,10 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) old_clock = tk->tkr_mono.clock; tk->tkr_mono.clock = clock; - tk->tkr_mono.read = clock->read; tk->tkr_mono.mask = clock->mask; - tk->tkr_mono.cycle_last = tk->tkr_mono.read(clock); + tk->tkr_mono.cycle_last = tk_clock_read(&tk->tkr_mono); tk->tkr_raw.clock = clock; - tk->tkr_raw.read = clock->read; tk->tkr_raw.mask = clock->mask; tk->tkr_raw.cycle_last = tk->tkr_mono.cycle_last; @@ -259,18 +281,19 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) /* Go back from cycles -> shifted ns */ tk->xtime_interval = (u64) interval * clock->mult; tk->xtime_remainder = ntpinterval - tk->xtime_interval; - tk->raw_interval = - ((u64) interval * clock->mult) >> clock->shift; + tk->raw_interval = interval * clock->mult; /* if changing clocks, convert xtime_nsec shift units */ if (old_clock) { int shift_change = clock->shift - old_clock->shift; - if (shift_change < 0) + if (shift_change < 0) { tk->tkr_mono.xtime_nsec >>= -shift_change; - else + tk->tkr_raw.xtime_nsec >>= -shift_change; + } else { tk->tkr_mono.xtime_nsec <<= shift_change; + tk->tkr_raw.xtime_nsec <<= shift_change; + } } - tk->tkr_raw.xtime_nsec = 0; tk->tkr_mono.shift = clock->shift; tk->tkr_raw.shift = clock->shift; @@ -404,7 +427,7 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) now += timekeeping_delta_to_ns(tkr, clocksource_delta( - tkr->read(tkr->clock), + tk_clock_read(tkr), tkr->cycle_last, tkr->mask)); } while (read_seqcount_retry(&tkf->seq, seq)); @@ -461,6 +484,10 @@ static cycle_t dummy_clock_read(struct clocksource *cs) return cycles_at_suspend; } +static struct clocksource dummy_clock = { + .read = dummy_clock_read, +}; + /** * halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource. * @tk: Timekeeper to snapshot. 
@@ -477,13 +504,13 @@ static void halt_fast_timekeeper(struct timekeeper *tk) struct tk_read_base *tkr = &tk->tkr_mono; memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy)); - cycles_at_suspend = tkr->read(tkr->clock); - tkr_dummy.read = dummy_clock_read; + cycles_at_suspend = tk_clock_read(tkr); + tkr_dummy.clock = &dummy_clock; update_fast_timekeeper(&tkr_dummy, &tk_fast_mono); tkr = &tk->tkr_raw; memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy)); - tkr_dummy.read = dummy_clock_read; + tkr_dummy.clock = &dummy_clock; update_fast_timekeeper(&tkr_dummy, &tk_fast_raw); } @@ -595,9 +622,6 @@ static inline void tk_update_ktime_data(struct timekeeper *tk) nsec = (u32) tk->wall_to_monotonic.tv_nsec; tk->tkr_mono.base = ns_to_ktime(seconds * NSEC_PER_SEC + nsec); - /* Update the monotonic raw base */ - tk->tkr_raw.base = timespec64_to_ktime(tk->raw_time); - /* * The sum of the nanoseconds portions of xtime and * wall_to_monotonic can be greater/equal one second. Take @@ -607,6 +631,9 @@ static inline void tk_update_ktime_data(struct timekeeper *tk) if (nsec >= NSEC_PER_SEC) seconds++; tk->ktime_sec = seconds; + + /* Update the monotonic raw base */ + tk->tkr_raw.base = ns_to_ktime(tk->raw_sec * NSEC_PER_SEC); } /* must hold timekeeper_lock */ @@ -647,11 +674,9 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action) */ static void timekeeping_forward_now(struct timekeeper *tk) { - struct clocksource *clock = tk->tkr_mono.clock; cycle_t cycle_now, delta; - s64 nsec; - cycle_now = tk->tkr_mono.read(clock); + cycle_now = tk_clock_read(&tk->tkr_mono); delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask); tk->tkr_mono.cycle_last = cycle_now; tk->tkr_raw.cycle_last = cycle_now; @@ -661,10 +686,13 @@ static void timekeeping_forward_now(struct timekeeper *tk) /* If arch requires, add in get_arch_timeoffset() */ tk->tkr_mono.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr_mono.shift; - tk_normalize_xtime(tk); - nsec = clocksource_cyc2ns(delta, tk->tkr_raw.mult, tk->tkr_raw.shift); - timespec64_add_ns(&tk->raw_time, nsec); + tk->tkr_raw.xtime_nsec += delta * tk->tkr_raw.mult; + + /* If arch requires, add in get_arch_timeoffset() */ + tk->tkr_raw.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr_raw.shift; + + tk_normalize_xtime(tk); } /** @@ -1158,19 +1186,18 @@ int timekeeping_notify(struct clocksource *clock) void getrawmonotonic64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; - struct timespec64 ts64; unsigned long seq; s64 nsecs; do { seq = read_seqcount_begin(&tk_core.seq); + ts->tv_sec = tk->raw_sec; nsecs = timekeeping_get_ns(&tk->tkr_raw); - ts64 = tk->raw_time; } while (read_seqcount_retry(&tk_core.seq, seq)); - timespec64_add_ns(&ts64, nsecs); - *ts = ts64; + ts->tv_nsec = 0; + timespec64_add_ns(ts, nsecs); } EXPORT_SYMBOL(getrawmonotonic64); @@ -1294,8 +1321,7 @@ void __init timekeeping_init(void) tk_setup_internals(tk, clock); tk_set_xtime(tk, &now); - tk->raw_time.tv_sec = 0; - tk->raw_time.tv_nsec = 0; + tk->raw_sec = 0; if (boot.tv_sec == 0 && boot.tv_nsec == 0) boot = tk_xtime(tk); @@ -1434,7 +1460,7 @@ void timekeeping_resume(void) * The less preferred source will only be tried if there is no better * usable source. The rtc part is handled separately in rtc core code. 
*/ - cycle_now = tk->tkr_mono.read(clock); + cycle_now = tk_clock_read(&tk->tkr_mono); if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) && cycle_now > tk->tkr_mono.cycle_last) { u64 num, max = ULLONG_MAX; @@ -1775,7 +1801,7 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, unsigned int *clock_set) { cycle_t interval = tk->cycle_interval << shift; - u64 raw_nsecs; + u64 snsec_per_sec; /* If the offset is smaller than a shifted interval, do nothing */ if (offset < interval) @@ -1790,14 +1816,12 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, *clock_set |= accumulate_nsecs_to_secs(tk); /* Accumulate raw time */ - raw_nsecs = (u64)tk->raw_interval << shift; - raw_nsecs += tk->raw_time.tv_nsec; - if (raw_nsecs >= NSEC_PER_SEC) { - u64 raw_secs = raw_nsecs; - raw_nsecs = do_div(raw_secs, NSEC_PER_SEC); - tk->raw_time.tv_sec += raw_secs; + tk->tkr_raw.xtime_nsec += tk->raw_interval << shift; + snsec_per_sec = (u64)NSEC_PER_SEC << tk->tkr_raw.shift; + while (tk->tkr_raw.xtime_nsec >= snsec_per_sec) { + tk->tkr_raw.xtime_nsec -= snsec_per_sec; + tk->raw_sec++; } - tk->raw_time.tv_nsec = raw_nsecs; /* Accumulate error between NTP and clock interval */ tk->ntp_error += tk->ntp_tick << shift; @@ -1829,7 +1853,7 @@ void update_wall_time(void) #ifdef CONFIG_ARCH_USES_GETTIMEOFFSET offset = real_tk->cycle_interval; #else - offset = clocksource_delta(tk->tkr_mono.read(tk->tkr_mono.clock), + offset = clocksource_delta(tk_clock_read(&tk->tkr_mono), tk->tkr_mono.cycle_last, tk->tkr_mono.mask); #endif diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 90a82deece45..903705687b52 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -131,7 +131,7 @@ int timer_migration_handler(struct ctl_table *table, int write, int ret; mutex_lock(&mutex); - ret = proc_dointvec(table, write, buffer, lenp, ppos); + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (!ret && write) timers_update_migration(false); mutex_unlock(&mutex); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 048bf074bef9..3c7b7a9bcad1 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -190,6 +190,17 @@ config FUNCTION_GRAPH_TRACER address on the current task structure into a stack of calls. +config PREEMPTIRQ_EVENTS + bool "Enable trace events for preempt and irq disable/enable" + select TRACE_IRQFLAGS + depends on DEBUG_PREEMPT || !PROVE_LOCKING + default n + help + Enable tracing of disable and enable events for preemption and irqs. + For tracing preempt disable/enable events, DEBUG_PREEMPT must be + enabled. For tracing irq disable/enable events, PROVE_LOCKING must + be disabled. 
+ config IRQSOFF_TRACER bool "Interrupts-off Latency Tracer" default n diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 2963266fb7bf..a0177ae43058 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -37,6 +37,7 @@ obj-$(CONFIG_TRACING) += trace_stat.o obj-$(CONFIG_TRACING) += trace_printk.o obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o +obj-$(CONFIG_PREEMPTIRQ_EVENTS) += trace_irqsoff.o obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 7b6127653a37..b674a7a8d655 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -57,7 +57,8 @@ static struct tracer_flags blk_tracer_flags = { }; /* Global reference count of probes */ -static atomic_t blk_probes_ref = ATOMIC_INIT(0); +static DEFINE_MUTEX(blk_probe_mutex); +static int blk_probes_ref; static void blk_register_tracepoints(void); static void blk_unregister_tracepoints(void); @@ -300,11 +301,26 @@ static void blk_trace_free(struct blk_trace *bt) kfree(bt); } +static void get_probe_ref(void) +{ + mutex_lock(&blk_probe_mutex); + if (++blk_probes_ref == 1) + blk_register_tracepoints(); + mutex_unlock(&blk_probe_mutex); +} + +static void put_probe_ref(void) +{ + mutex_lock(&blk_probe_mutex); + if (!--blk_probes_ref) + blk_unregister_tracepoints(); + mutex_unlock(&blk_probe_mutex); +} + static void blk_trace_cleanup(struct blk_trace *bt) { blk_trace_free(bt); - if (atomic_dec_and_test(&blk_probes_ref)) - blk_unregister_tracepoints(); + put_probe_ref(); } int blk_trace_remove(struct request_queue *q) @@ -522,8 +538,7 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (cmpxchg(&q->blk_trace, NULL, bt)) goto err; - if (atomic_inc_return(&blk_probes_ref) == 1) - blk_register_tracepoints(); + get_probe_ref(); return 0; err: @@ -1518,9 +1533,7 @@ static int blk_trace_remove_queue(struct request_queue *q) if (bt == NULL) return -EINVAL; - if (atomic_dec_and_test(&blk_probes_ref)) - blk_unregister_tracepoints(); - + put_probe_ref(); blk_trace_free(bt); return 0; } @@ -1551,8 +1564,7 @@ static int blk_trace_setup_queue(struct request_queue *q, if (cmpxchg(&q->blk_trace, NULL, bt)) goto free_bt; - if (atomic_inc_return(&blk_probes_ref) == 1) - blk_register_tracepoints(); + get_probe_ref(); return 0; free_bt: diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 34b2a0d5cf1a..ac758a53fcea 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2667,13 +2667,14 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) if (!command || !ftrace_enabled) { /* - * If these are control ops, they still need their - * per_cpu field freed. Since, function tracing is + * If these are dynamic or control ops, they still + * need their data freed. Since, function tracing is * not currently active, we can just free them * without synchronizing all CPUs. 
*/ - if (ops->flags & FTRACE_OPS_FL_CONTROL) - control_ops_free(ops); + if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_CONTROL)) + goto free_ops; + return 0; } @@ -2728,6 +2729,7 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_CONTROL)) { schedule_on_each_cpu(ftrace_sync); + free_ops: arch_ftrace_trampoline_free(ops); if (ops->flags & FTRACE_OPS_FL_CONTROL) @@ -3535,7 +3537,7 @@ match_records(struct ftrace_hash *hash, char *func, int len, char *mod) int exclude_mod = 0; int found = 0; int ret; - int clear_filter; + int clear_filter = 0; if (func) { func_g.type = filter_parse_regex(func, len, &func_g.search, @@ -3843,7 +3845,6 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, func_g.type = filter_parse_regex(glob, strlen(glob), &func_g.search, ¬); func_g.len = strlen(func_g.search); - func_g.search = glob; /* we do not support '!' for function probes */ if (WARN_ON(not)) @@ -4313,9 +4314,6 @@ static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata; static char ftrace_graph_notrace_buf[FTRACE_FILTER_SIZE] __initdata; static int ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer); -static unsigned long save_global_trampoline; -static unsigned long save_global_flags; - static int __init set_graph_function(char *str) { strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE); @@ -5905,17 +5903,6 @@ void unregister_ftrace_graph(void) unregister_pm_notifier(&ftrace_suspend_notifier); unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); -#ifdef CONFIG_DYNAMIC_FTRACE - /* - * Function graph does not allocate the trampoline, but - * other global_ops do. We need to reset the ALLOC_TRAMP flag - * if one was used. - */ - global_ops.trampoline = save_global_trampoline; - if (save_global_flags & FTRACE_OPS_FL_ALLOC_TRAMP) - global_ops.flags |= FTRACE_OPS_FL_ALLOC_TRAMP; -#endif - out: mutex_unlock(&ftrace_lock); } diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 1275175b0946..d9cd6191760b 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -280,6 +280,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data); /* Missed count stored at end */ #define RB_MISSED_STORED (1 << 30) +#define RB_MISSED_FLAGS (RB_MISSED_EVENTS|RB_MISSED_STORED) + struct buffer_data_page { u64 time_stamp; /* page time stamp */ local_t commit; /* write committed index */ @@ -331,7 +333,9 @@ static void rb_init_page(struct buffer_data_page *bpage) */ size_t ring_buffer_page_len(void *page) { - return local_read(&((struct buffer_data_page *)page)->commit) + struct buffer_data_page *bpage = page; + + return (local_read(&bpage->commit) & ~RB_MISSED_FLAGS) + BUF_PAGE_HDR_SIZE; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 60d246c4eefa..9510d540b48e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1362,7 +1362,7 @@ static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED; struct saved_cmdlines_buffer { unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; unsigned *map_cmdline_to_pid; - unsigned *saved_tgids; + unsigned *map_cmdline_to_tgid; unsigned cmdline_num; int cmdline_idx; char *saved_cmdlines; @@ -1396,9 +1396,10 @@ static int allocate_cmdlines_buffer(unsigned int val, return -ENOMEM; } - s->saved_tgids = kmalloc_array(val, sizeof(*s->saved_tgids), - GFP_KERNEL); - if (!s->saved_tgids) { + s->map_cmdline_to_tgid = kmalloc_array(val, + sizeof(*s->map_cmdline_to_tgid), + GFP_KERNEL); + if 
(!s->map_cmdline_to_tgid) { kfree(s->map_cmdline_to_pid); kfree(s->saved_cmdlines); return -ENOMEM; @@ -1410,8 +1411,8 @@ static int allocate_cmdlines_buffer(unsigned int val, sizeof(s->map_pid_to_cmdline)); memset(s->map_cmdline_to_pid, NO_CMDLINE_MAP, val * sizeof(*s->map_cmdline_to_pid)); - memset(s->saved_tgids, 0, - val * sizeof(*s->saved_tgids)); + memset(s->map_cmdline_to_tgid, NO_CMDLINE_MAP, + val * sizeof(*s->map_cmdline_to_tgid)); return 0; } @@ -1577,14 +1578,17 @@ static int trace_save_cmdline(struct task_struct *tsk) if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT)) return 0; + preempt_disable(); /* * It's not the end of the world if we don't get * the lock, but we also don't want to spin * nor do we want to disable interrupts, * so if we miss here, then better luck next time. */ - if (!arch_spin_trylock(&trace_cmdline_lock)) + if (!arch_spin_trylock(&trace_cmdline_lock)) { + preempt_enable(); return 0; + } idx = savedcmd->map_pid_to_cmdline[tsk->pid]; if (idx == NO_CMDLINE_MAP) { @@ -1607,8 +1611,9 @@ static int trace_save_cmdline(struct task_struct *tsk) } set_cmdline(idx, tsk->comm); - savedcmd->saved_tgids[idx] = tsk->tgid; + savedcmd->map_cmdline_to_tgid[idx] = tsk->tgid; arch_spin_unlock(&trace_cmdline_lock); + preempt_enable(); return 1; } @@ -1650,19 +1655,29 @@ void trace_find_cmdline(int pid, char comm[]) preempt_enable(); } -int trace_find_tgid(int pid) +static int __find_tgid_locked(int pid) { unsigned map; int tgid; - preempt_disable(); - arch_spin_lock(&trace_cmdline_lock); map = savedcmd->map_pid_to_cmdline[pid]; if (map != NO_CMDLINE_MAP) - tgid = savedcmd->saved_tgids[map]; + tgid = savedcmd->map_cmdline_to_tgid[map]; else tgid = -1; + return tgid; +} + +int trace_find_tgid(int pid) +{ + int tgid; + + preempt_disable(); + arch_spin_lock(&trace_cmdline_lock); + + tgid = __find_tgid_locked(pid); + arch_spin_unlock(&trace_cmdline_lock); preempt_enable(); @@ -3288,11 +3303,17 @@ static int tracing_open(struct inode *inode, struct file *file) /* If this file was open for write, then erase contents */ if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { int cpu = tracing_get_cpu(inode); + struct trace_buffer *trace_buf = &tr->trace_buffer; + +#ifdef CONFIG_TRACER_MAX_TRACE + if (tr->current_trace->print_max) + trace_buf = &tr->max_buffer; +#endif if (cpu == RING_BUFFER_ALL_CPUS) - tracing_reset_online_cpus(&tr->trace_buffer); + tracing_reset_online_cpus(trace_buf); else - tracing_reset(&tr->trace_buffer, cpu); + tracing_reset(trace_buf, cpu); } if (file->f_mode & FMODE_READ) { @@ -3440,37 +3461,30 @@ static const struct file_operations show_traces_fops = { .llseek = seq_lseek, }; -/* - * The tracer itself will not take this lock, but still we want - * to provide a consistent cpumask to user-space: - */ -static DEFINE_MUTEX(tracing_cpumask_update_lock); - -/* - * Temporary storage for the character representation of the - * CPU bitmask (and one more byte for the newline): - */ -static char mask_str[NR_CPUS + 1]; - static ssize_t tracing_cpumask_read(struct file *filp, char __user *ubuf, size_t count, loff_t *ppos) { struct trace_array *tr = file_inode(filp)->i_private; + char *mask_str; int len; - mutex_lock(&tracing_cpumask_update_lock); + len = snprintf(NULL, 0, "%*pb\n", + cpumask_pr_args(tr->tracing_cpumask)) + 1; + mask_str = kmalloc(len, GFP_KERNEL); + if (!mask_str) + return -ENOMEM; - len = snprintf(mask_str, count, "%*pb\n", + len = snprintf(mask_str, len, "%*pb\n", cpumask_pr_args(tr->tracing_cpumask)); if (len >= count) { count = -EINVAL; 
goto out_err; } - count = simple_read_from_buffer(ubuf, count, ppos, mask_str, NR_CPUS+1); + count = simple_read_from_buffer(ubuf, count, ppos, mask_str, len); out_err: - mutex_unlock(&tracing_cpumask_update_lock); + kfree(mask_str); return count; } @@ -3490,8 +3504,6 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, if (err) goto err_unlock; - mutex_lock(&tracing_cpumask_update_lock); - local_irq_disable(); arch_spin_lock(&tr->max_lock); for_each_tracing_cpu(cpu) { @@ -3514,8 +3526,6 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, local_irq_enable(); cpumask_copy(tr->tracing_cpumask, tracing_cpumask_new); - - mutex_unlock(&tracing_cpumask_update_lock); free_cpumask_var(tracing_cpumask_new); return count; @@ -3979,10 +3989,15 @@ tracing_saved_cmdlines_size_read(struct file *filp, char __user *ubuf, { char buf[64]; int r; + unsigned int n; + preempt_disable(); arch_spin_lock(&trace_cmdline_lock); - r = scnprintf(buf, sizeof(buf), "%u\n", savedcmd->cmdline_num); + n = savedcmd->cmdline_num; arch_spin_unlock(&trace_cmdline_lock); + preempt_enable(); + + r = scnprintf(buf, sizeof(buf), "%u\n", n); return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } @@ -3991,7 +4006,7 @@ static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s) { kfree(s->saved_cmdlines); kfree(s->map_cmdline_to_pid); - kfree(s->saved_tgids); + kfree(s->map_cmdline_to_tgid); kfree(s); } @@ -4008,10 +4023,12 @@ static int tracing_resize_saved_cmdlines(unsigned int val) return -ENOMEM; } + preempt_disable(); arch_spin_lock(&trace_cmdline_lock); savedcmd_temp = savedcmd; savedcmd = s; arch_spin_unlock(&trace_cmdline_lock); + preempt_enable(); free_saved_cmdlines_buffer(savedcmd_temp); return 0; @@ -4230,33 +4247,61 @@ tracing_saved_tgids_read(struct file *file, char __user *ubuf, char *file_buf; char *buf; int len = 0; - int pid; int i; + int *pids; + int n = 0; - file_buf = kmalloc(savedcmd->cmdline_num*(16+1+16), GFP_KERNEL); - if (!file_buf) - return -ENOMEM; + preempt_disable(); + arch_spin_lock(&trace_cmdline_lock); - buf = file_buf; + pids = kmalloc_array(savedcmd->cmdline_num, 2*sizeof(int), GFP_KERNEL); + if (!pids) { + arch_spin_unlock(&trace_cmdline_lock); + preempt_enable(); + return -ENOMEM; + } for (i = 0; i < savedcmd->cmdline_num; i++) { - int tgid; - int r; + int pid; pid = savedcmd->map_cmdline_to_pid[i]; if (pid == -1 || pid == NO_CMDLINE_MAP) continue; - tgid = trace_find_tgid(pid); - r = sprintf(buf, "%d %d\n", pid, tgid); + pids[n] = pid; + pids[n+1] = __find_tgid_locked(pid); + n += 2; + } + arch_spin_unlock(&trace_cmdline_lock); + preempt_enable(); + + if (n == 0) { + kfree(pids); + return 0; + } + + /* enough to hold max pair of pids + space, lr and nul */ + len = n * 12; + file_buf = kmalloc(len, GFP_KERNEL); + if (!file_buf) { + kfree(pids); + return -ENOMEM; + } + + buf = file_buf; + for (i = 0; i < n && len > 0; i += 2) { + int r; + + r = snprintf(buf, len, "%d %d\n", pids[i], pids[i+1]); buf += r; - len += r; + len -= r; } len = simple_read_from_buffer(ubuf, cnt, ppos, - file_buf, len); + file_buf, buf - file_buf); kfree(file_buf); + kfree(pids); return len; } @@ -4808,7 +4853,7 @@ static int tracing_wait_pipe(struct file *filp) * * iter->pos will be 0 if we haven't read anything. 
*/ - if (!tracing_is_on() && iter->pos) + if (!tracer_tracing_is_on(iter->tr) && iter->pos) break; mutex_unlock(&iter->mutex); @@ -5347,7 +5392,7 @@ static int tracing_set_clock(struct trace_array *tr, const char *clockstr) tracing_reset_online_cpus(&tr->trace_buffer); #ifdef CONFIG_TRACER_MAX_TRACE - if (tr->flags & TRACE_ARRAY_FL_GLOBAL && tr->max_buffer.buffer) + if (tr->max_buffer.buffer) ring_buffer_set_clock(tr->max_buffer.buffer, trace_clocks[i].func); tracing_reset_online_cpus(&tr->max_buffer); #endif @@ -5869,7 +5914,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, .spd_release = buffer_spd_release, }; struct buffer_ref *ref; - int entries, size, i; + int entries, i; ssize_t ret = 0; #ifdef CONFIG_TRACER_MAX_TRACE @@ -5920,14 +5965,6 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, break; } - /* - * zero out any left over data, this is going to - * user land. - */ - size = ring_buffer_page_len(ref->page); - if (size < PAGE_SIZE) - memset(ref->page + size, 0, PAGE_SIZE - size); - page = virt_to_page(ref->page); spd.pages[i] = page; @@ -6654,6 +6691,7 @@ allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size buf->data = alloc_percpu(struct trace_array_cpu); if (!buf->data) { ring_buffer_free(buf->buffer); + buf->buffer = NULL; return -ENOMEM; } @@ -6677,7 +6715,9 @@ static int allocate_trace_buffers(struct trace_array *tr, int size) allocate_snapshot ? size : 1); if (WARN_ON(ret)) { ring_buffer_free(tr->trace_buffer.buffer); + tr->trace_buffer.buffer = NULL; free_percpu(tr->trace_buffer.data); + tr->trace_buffer.data = NULL; return -ENOMEM; } tr->allocated_snapshot = allocate_snapshot; @@ -6847,6 +6887,7 @@ static int instance_rmdir(const char *name) } kfree(tr->topts); + free_cpumask_var(tr->tracing_cpumask); kfree(tr->name); kfree(tr); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 26960e49bb8c..1235f9fd9fbd 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2301,6 +2301,7 @@ void trace_event_enum_update(struct trace_enum_map **map, int len) { struct trace_event_call *call, *p; const char *last_system = NULL; + bool first = false; int last_i; int i; @@ -2308,15 +2309,28 @@ void trace_event_enum_update(struct trace_enum_map **map, int len) list_for_each_entry_safe(call, p, &ftrace_events, list) { /* events are usually grouped together with systems */ if (!last_system || call->class->system != last_system) { + first = true; last_i = 0; last_system = call->class->system; } + /* + * Since calls are grouped by systems, the likelyhood that the + * next call in the iteration belongs to the same system as the + * previous call is high. As an optimization, we skip seaching + * for a map[] that matches the call's system if the last call + * was from the same system. That's what last_i is for. If the + * call has the same system as the previous call, then last_i + * will be the index of the first map[] that has a matching + * system. 
+ */ for (i = last_i; i < len; i++) { if (call->class->system == map[i]->system) { /* Save the first system if need be */ - if (!last_i) + if (first) { last_i = i; + first = false; + } update_event_printk(call, map[i]); } } diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 6816302542b2..f0e5408499b6 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1979,6 +1979,10 @@ static int create_filter(struct trace_event_call *call, if (err && set_str) append_filter_err(ps, filter); } + if (err && !set_str) { + free_event_filter(filter); + filter = NULL; + } create_filter_finish(ps); *filterp = filter; diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 96c75b0e9831..a804ee1b3ec6 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -8,6 +8,7 @@ */ #include <linux/uaccess.h> #include <linux/ftrace.h> +#include <linux/interrupt.h> #include <linux/slab.h> #include <linux/fs.h> diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index be3222b7d72e..21b162c07e83 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -16,6 +16,10 @@ #include "trace.h" +#define CREATE_TRACE_POINTS +#include <trace/events/preemptirq.h> + +#if defined(CONFIG_IRQSOFF_TRACER) || defined(CONFIG_PREEMPT_TRACER) static struct trace_array *irqsoff_trace __read_mostly; static int tracer_enabled __read_mostly; @@ -451,63 +455,43 @@ void time_hardirqs_off(unsigned long a0, unsigned long a1) #else /* !CONFIG_PROVE_LOCKING */ /* - * Stubs: - */ - -void trace_softirqs_on(unsigned long ip) -{ -} - -void trace_softirqs_off(unsigned long ip) -{ -} - -inline void print_irqtrace_events(struct task_struct *curr) -{ -} - -/* * We are only interested in hardirq on/off events: */ -void trace_hardirqs_on(void) +static inline void tracer_hardirqs_on(void) { if (!preempt_trace() && irq_trace()) stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } -EXPORT_SYMBOL(trace_hardirqs_on); -void trace_hardirqs_off(void) +static inline void tracer_hardirqs_off(void) { if (!preempt_trace() && irq_trace()) start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } -EXPORT_SYMBOL(trace_hardirqs_off); -__visible void trace_hardirqs_on_caller(unsigned long caller_addr) +static inline void tracer_hardirqs_on_caller(unsigned long caller_addr) { if (!preempt_trace() && irq_trace()) stop_critical_timing(CALLER_ADDR0, caller_addr); } -EXPORT_SYMBOL(trace_hardirqs_on_caller); -__visible void trace_hardirqs_off_caller(unsigned long caller_addr) +static inline void tracer_hardirqs_off_caller(unsigned long caller_addr) { if (!preempt_trace() && irq_trace()) start_critical_timing(CALLER_ADDR0, caller_addr); } -EXPORT_SYMBOL(trace_hardirqs_off_caller); #endif /* CONFIG_PROVE_LOCKING */ #endif /* CONFIG_IRQSOFF_TRACER */ #ifdef CONFIG_PREEMPT_TRACER -void trace_preempt_on(unsigned long a0, unsigned long a1) +static inline void tracer_preempt_on(unsigned long a0, unsigned long a1) { if (preempt_trace() && !irq_trace()) stop_critical_timing(a0, a1); } -void trace_preempt_off(unsigned long a0, unsigned long a1) +static inline void tracer_preempt_off(unsigned long a0, unsigned long a1) { if (preempt_trace() && !irq_trace()) start_critical_timing(a0, a1); @@ -770,3 +754,100 @@ __init static int init_irqsoff_tracer(void) return 0; } core_initcall(init_irqsoff_tracer); +#endif /* IRQSOFF_TRACER || PREEMPTOFF_TRACER */ + +#ifndef CONFIG_IRQSOFF_TRACER +static inline void 
tracer_hardirqs_on(void) { } +static inline void tracer_hardirqs_off(void) { } +static inline void tracer_hardirqs_on_caller(unsigned long caller_addr) { } +static inline void tracer_hardirqs_off_caller(unsigned long caller_addr) { } +#endif + +#ifndef CONFIG_PREEMPT_TRACER +static inline void tracer_preempt_on(unsigned long a0, unsigned long a1) { } +static inline void tracer_preempt_off(unsigned long a0, unsigned long a1) { } +#endif + +/* Per-cpu variable to prevent redundant calls when IRQs already off */ +static DEFINE_PER_CPU(int, tracing_irq_cpu); + +#if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PROVE_LOCKING) +void trace_hardirqs_on(void) +{ + if (!this_cpu_read(tracing_irq_cpu)) + return; + + trace_irq_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1); + tracer_hardirqs_on(); + + this_cpu_write(tracing_irq_cpu, 0); +} +EXPORT_SYMBOL(trace_hardirqs_on); + +void trace_hardirqs_off(void) +{ + if (this_cpu_read(tracing_irq_cpu)) + return; + + this_cpu_write(tracing_irq_cpu, 1); + + trace_irq_disable_rcuidle(CALLER_ADDR0, CALLER_ADDR1); + tracer_hardirqs_off(); +} +EXPORT_SYMBOL(trace_hardirqs_off); + +__visible void trace_hardirqs_on_caller(unsigned long caller_addr) +{ + if (!this_cpu_read(tracing_irq_cpu)) + return; + + trace_irq_enable_rcuidle(CALLER_ADDR0, caller_addr); + tracer_hardirqs_on_caller(caller_addr); + + this_cpu_write(tracing_irq_cpu, 0); +} +EXPORT_SYMBOL(trace_hardirqs_on_caller); + +__visible void trace_hardirqs_off_caller(unsigned long caller_addr) +{ + if (this_cpu_read(tracing_irq_cpu)) + return; + + this_cpu_write(tracing_irq_cpu, 1); + + trace_irq_disable_rcuidle(CALLER_ADDR0, caller_addr); + tracer_hardirqs_off_caller(caller_addr); +} +EXPORT_SYMBOL(trace_hardirqs_off_caller); + +/* + * Stubs: + */ + +void trace_softirqs_on(unsigned long ip) +{ +} + +void trace_softirqs_off(unsigned long ip) +{ +} + +inline void print_irqtrace_events(struct task_struct *curr) +{ +} +#endif + +#if defined(CONFIG_PREEMPT_TRACER) || \ + (defined(CONFIG_DEBUG_PREEMPT) && defined(CONFIG_PREEMPTIRQ_EVENTS)) +void trace_preempt_on(unsigned long a0, unsigned long a1) +{ + trace_preempt_enable_rcuidle(a0, a1); + tracer_preempt_on(a0, a1); +} + +void trace_preempt_off(unsigned long a0, unsigned long a1) +{ + trace_preempt_disable_rcuidle(a0, a1); + tracer_preempt_off(a0, a1); +} +#endif diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 12ea4ea619ee..e9092a0247bf 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -659,30 +659,25 @@ static int create_trace_kprobe(int argc, char **argv) pr_info("Probe point is not specified.\n"); return -EINVAL; } - if (isdigit(argv[1][0])) { - if (is_return) { - pr_info("Return probe point must be a symbol.\n"); - return -EINVAL; - } - /* an address specified */ - ret = kstrtoul(&argv[1][0], 0, (unsigned long *)&addr); - if (ret) { - pr_info("Failed to parse address.\n"); - return ret; - } - } else { + + /* try to parse an address. if that fails, try to read the + * input as a symbol. 
*/ + if (kstrtoul(argv[1], 0, (unsigned long *)&addr)) { /* a symbol specified */ symbol = argv[1]; /* TODO: support .init module functions */ ret = traceprobe_split_symbol_offset(symbol, &offset); if (ret) { - pr_info("Failed to parse symbol.\n"); + pr_info("Failed to parse either an address or a symbol.\n"); return ret; } if (offset && is_return) { pr_info("Return probe must be used without offset.\n"); return -EINVAL; } + } else if (is_return) { + pr_info("Return probe point must be a symbol.\n"); + return -EINVAL; } argc -= 2; argv += 2; diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index b0f86ea77881..ca70d11b8aa7 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -272,7 +272,7 @@ static int trace_selftest_ops(struct trace_array *tr, int cnt) goto out_free; if (cnt > 1) { if (trace_selftest_test_global_cnt == 0) - goto out; + goto out_free; } if (trace_selftest_test_dyn_cnt == 0) goto out_free; diff --git a/kernel/uid16.c b/kernel/uid16.c index d58cc4d8f0d1..651aaa5221ec 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c @@ -190,6 +190,7 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist) return retval; } + groups_sort(group_info); retval = set_current_groups(group_info); put_group_info(group_info); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 73c018d7df00..a719a4ad2e74 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -70,6 +70,7 @@ enum { * attach_mutex to avoid changing binding state while * worker_attach_to_pool() is in progress. */ + POOL_MANAGER_ACTIVE = 1 << 0, /* being managed */ POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */ /* worker flags */ @@ -167,7 +168,6 @@ struct worker_pool { /* L: hash of busy workers */ /* see manage_workers() for details on the two manager mutexes */ - struct mutex manager_arb; /* manager arbitration */ struct worker *manager; /* L: purely informational */ struct mutex attach_mutex; /* attach/detach exclusion */ struct list_head workers; /* A: attached workers */ @@ -299,6 +299,7 @@ static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf; static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */ static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */ +static DECLARE_WAIT_QUEUE_HEAD(wq_manager_wait); /* wait for manager to go away */ static LIST_HEAD(workqueues); /* PR: list of all workqueues */ static bool workqueue_freezing; /* PL: have wqs started freezing? */ @@ -812,7 +813,7 @@ static bool need_to_create_worker(struct worker_pool *pool) /* Do we have too many workers and should some go away? */ static bool too_many_workers(struct worker_pool *pool) { - bool managing = mutex_is_locked(&pool->manager_arb); + bool managing = pool->flags & POOL_MANAGER_ACTIVE; int nr_idle = pool->nr_idle + managing; /* manager is considered idle */ int nr_busy = pool->nr_workers - nr_idle; @@ -1492,6 +1493,7 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, struct timer_list *timer = &dwork->timer; struct work_struct *work = &dwork->work; + WARN_ON_ONCE(!wq); WARN_ON_ONCE(timer->function != delayed_work_timer_fn || timer->data != (unsigned long)dwork); WARN_ON_ONCE(timer_pending(timer)); @@ -1964,24 +1966,17 @@ static bool manage_workers(struct worker *worker) { struct worker_pool *pool = worker->pool; - /* - * Anyone who successfully grabs manager_arb wins the arbitration - * and becomes the manager. 
mutex_trylock() on pool->manager_arb - * failure while holding pool->lock reliably indicates that someone - * else is managing the pool and the worker which failed trylock - * can proceed to executing work items. This means that anyone - * grabbing manager_arb is responsible for actually performing - * manager duties. If manager_arb is grabbed and released without - * actual management, the pool may stall indefinitely. - */ - if (!mutex_trylock(&pool->manager_arb)) + if (pool->flags & POOL_MANAGER_ACTIVE) return false; + + pool->flags |= POOL_MANAGER_ACTIVE; pool->manager = worker; maybe_create_worker(pool); pool->manager = NULL; - mutex_unlock(&pool->manager_arb); + pool->flags &= ~POOL_MANAGER_ACTIVE; + wake_up(&wq_manager_wait); return true; } @@ -3141,7 +3136,6 @@ static int init_worker_pool(struct worker_pool *pool) setup_timer(&pool->mayday_timer, pool_mayday_timeout, (unsigned long)pool); - mutex_init(&pool->manager_arb); mutex_init(&pool->attach_mutex); INIT_LIST_HEAD(&pool->workers); @@ -3211,13 +3205,15 @@ static void put_unbound_pool(struct worker_pool *pool) hash_del(&pool->hash_node); /* - * Become the manager and destroy all workers. Grabbing - * manager_arb prevents @pool's workers from blocking on - * attach_mutex. + * Become the manager and destroy all workers. This prevents + * @pool's workers from blocking on attach_mutex. We're the last + * manager and @pool gets freed with the flag set. */ - mutex_lock(&pool->manager_arb); - spin_lock_irq(&pool->lock); + wait_event_lock_irq(wq_manager_wait, + !(pool->flags & POOL_MANAGER_ACTIVE), pool->lock); + pool->flags |= POOL_MANAGER_ACTIVE; + while ((worker = first_idle_worker(pool))) destroy_worker(worker); WARN_ON(pool->nr_workers || pool->nr_idle); @@ -3231,8 +3227,6 @@ static void put_unbound_pool(struct worker_pool *pool) if (pool->detach_completion) wait_for_completion(pool->detach_completion); - mutex_unlock(&pool->manager_arb); - /* shut down the timers */ del_timer_sync(&pool->idle_timer); del_timer_sync(&pool->mayday_timer); @@ -3669,8 +3663,12 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq, return -EINVAL; /* creating multiple pwqs breaks ordering guarantee */ - if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs))) - return -EINVAL; + if (!list_empty(&wq->pwqs)) { + if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT)) + return -EINVAL; + + wq->flags &= ~__WQ_ORDERED; + } ctx = apply_wqattrs_prepare(wq, attrs); @@ -3856,6 +3854,16 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, struct workqueue_struct *wq; struct pool_workqueue *pwq; + /* + * Unbound && max_active == 1 used to imply ordered, which is no + * longer the case on NUMA machines due to per-node pools. While + * alloc_ordered_workqueue() is the right way to create an ordered + * workqueue, keep the previous behavior to avoid subtle breakages + * on NUMA. 
+ */ + if ((flags & WQ_UNBOUND) && max_active == 1) + flags |= __WQ_ORDERED; + /* see the comment above the definition of WQ_POWER_EFFICIENT */ if ((flags & WQ_POWER_EFFICIENT) && wq_power_efficient) flags |= WQ_UNBOUND; @@ -4044,13 +4052,14 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) struct pool_workqueue *pwq; /* disallow meddling with max_active for ordered workqueues */ - if (WARN_ON(wq->flags & __WQ_ORDERED)) + if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT)) return; max_active = wq_clamp_max_active(max_active, wq->flags, wq->name); mutex_lock(&wq->mutex); + wq->flags &= ~__WQ_ORDERED; wq->saved_max_active = max_active; for_each_pwq(pwq, wq) @@ -5178,7 +5187,7 @@ int workqueue_sysfs_register(struct workqueue_struct *wq) * attributes breaks ordering guarantee. Disallow exposing ordered * workqueues. */ - if (WARN_ON(wq->flags & __WQ_ORDERED)) + if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT)) return -EINVAL; wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL); diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h index 45215870ac6c..3fa9c146fccb 100644 --- a/kernel/workqueue_internal.h +++ b/kernel/workqueue_internal.h @@ -9,6 +9,7 @@ #include <linux/workqueue.h> #include <linux/kthread.h> +#include <linux/preempt.h> struct worker_pool; @@ -59,7 +60,7 @@ struct worker { */ static inline struct worker *current_wq_worker(void) { - if (current->flags & PF_WQ_WORKER) + if (in_task() && (current->flags & PF_WQ_WORKER)) return kthread_data(current); return NULL; } |
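Note on the alarmtimer/hrtimer hunks above: plain ktime_add() is swapped for ktime_add_safe() so that user-controlled relative expiries saturate instead of wrapping into the past. A minimal standalone sketch of that saturating addition, assuming the GCC/Clang __builtin_add_overflow() helper and a placeholder KTIME_MAX_GUESS clamp (not the kernel's actual KTIME_SEC_MAX-based limit):

#include <stdint.h>
#include <stdio.h>

#define KTIME_MAX_GUESS INT64_MAX	/* placeholder for the kernel's clamp value */

/* Saturating 64-bit nanosecond addition: on overflow, clamp to the maximum
 * instead of wrapping, so a hostile interval cannot expire immediately. */
static int64_t ns_add_safe(int64_t lhs, int64_t rhs)
{
	int64_t res;

	if (__builtin_add_overflow(lhs, rhs, &res))
		res = KTIME_MAX_GUESS;
	return res;
}

int main(void)
{
	int64_t now  = 1000000000LL;		/* pretend "now" in ns */
	int64_t huge = INT64_MAX - 1;		/* hostile user-supplied interval */

	printf("normal add : %lld\n", (long long)ns_add_safe(now, 1000000000LL));
	printf("clamped add: %lld\n", (long long)ns_add_safe(now, huge));
	return 0;
}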
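Note on the timekeeping hunks: every clocksource read is funneled through tk_clock_read(), which takes one READ_ONCE() snapshot of the clock pointer so the read() callback is never paired with a different clocksource's state during a clock change. A rough userspace analogue using C11 atomics (names such as active_clock and jiffies_like_read are illustrative, not kernel APIs):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

struct clocksource {
	uint64_t (*read)(const struct clocksource *cs);
	uint64_t counter;		/* per-clock state the read() needs */
};

static uint64_t jiffies_like_read(const struct clocksource *cs)
{
	return cs->counter;
}

static struct clocksource boot_clock = { .read = jiffies_like_read, .counter = 42 };

/* The writer side swaps this pointer when the active clocksource changes. */
static _Atomic(struct clocksource *) active_clock = &boot_clock;

static uint64_t clock_read_once(void)
{
	/* One load of the pointer; both the callback and its argument come
	 * from the same snapshot, even if active_clock changes concurrently. */
	struct clocksource *clock = atomic_load_explicit(&active_clock,
							 memory_order_acquire);
	return clock->read(clock);
}

int main(void)
{
	printf("cycles: %llu\n", (unsigned long long)clock_read_once());
	return 0;
}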
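Note on the blktrace hunk: blk_probes_ref changes from an atomic_t to a plain counter guarded by a mutex, so the 0->1 registration and 1->0 teardown of the tracepoints are serialized with each other. A pthreads sketch of the same pattern, with stand-in register/unregister bodies:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t probe_mutex = PTHREAD_MUTEX_INITIALIZER;
static int probes_ref;

static void register_tracepoints(void)   { puts("register tracepoints"); }
static void unregister_tracepoints(void) { puts("unregister tracepoints"); }

/* First user registers the tracepoints; the transition happens while the
 * mutex is held, so it cannot race with a concurrent last-user teardown. */
static void get_probe_ref(void)
{
	pthread_mutex_lock(&probe_mutex);
	if (++probes_ref == 1)
		register_tracepoints();
	pthread_mutex_unlock(&probe_mutex);
}

/* Last user unregisters, again fully serialized by the same mutex. */
static void put_probe_ref(void)
{
	pthread_mutex_lock(&probe_mutex);
	if (!--probes_ref)
		unregister_tracepoints();
	pthread_mutex_unlock(&probe_mutex);
}

int main(void)
{
	get_probe_ref();	/* 0 -> 1: registers        */
	get_probe_ref();	/* 1 -> 2: no side effect   */
	put_probe_ref();
	put_probe_ref();	/* 1 -> 0: unregisters      */
	return 0;
}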
