From a81d32264721fcb29e54d981fe17519cef600d78 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Mon, 11 Sep 2017 17:10:37 -0700 Subject: ANDROID: sched/rt: schedtune: Add boost retention to RT Boosted RT tasks can be deboosted quickly, this makes boost usless for RT tasks and causes lots of glitching. Use timers to prevent de-boost too soon and wait for long enough such that next enqueue happens after a threshold. While this can be solved in the governor, there are following advantages: - The approach used is governor-independent - Reduces boost group lock contention for frequently sleepers/wakers Note: Fixed build breakage due to schedfreq dependency which isn't used for RT anymore. Bug: 30210506 Change-Id: I428a2695cac06cc3458cdde0dea72315e4e66c00 Signed-off-by: Joel Fernandes --- kernel/sched/core.c | 1 + kernel/sched/rt.c | 148 +++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/sched/sched.h | 1 + 3 files changed, 150 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fcb7887dacea..6c9213a6fa9f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2200,6 +2200,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) init_dl_task_timer(&p->dl); __dl_clear_params(p); + init_rt_schedtune_timer(&p->rt); INIT_LIST_HEAD(&p->rt.run_list); #ifdef CONFIG_PREEMPT_NOTIFIERS diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 0f21b12332df..d367aa11e511 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -7,6 +7,7 @@ #include #include +#include #include "walt.h" #include "tune.h" @@ -975,6 +976,70 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) return 0; } +#define RT_SCHEDTUNE_INTERVAL 50000000ULL + +static enum hrtimer_restart rt_schedtune_timer(struct hrtimer *timer) +{ + struct sched_rt_entity *rt_se = container_of(timer, + struct sched_rt_entity, + schedtune_timer); + struct task_struct *p = rt_task_of(rt_se); + struct rq *rq = task_rq(p); + + raw_spin_lock(&rq->lock); + + /* + * Nothing to do if: + * - task has switched runqueues + * - task isn't RT anymore + */ + if (rq != task_rq(p) || (p->sched_class != &rt_sched_class)) + goto out; + + /* + * If task got enqueued back during callback time, it means we raced + * with the enqueue on another cpu, that's Ok, just do nothing as + * enqueue path would have tried to cancel us and we shouldn't run + * Also check the schedtune_enqueued flag as class-switch on a + * sleeping task may have already canceled the timer and done dq + */ + if (p->on_rq || !rt_se->schedtune_enqueued) + goto out; + + /* + * RT task is no longer active, cancel boost + */ + rt_se->schedtune_enqueued = false; + schedtune_dequeue_task(p, cpu_of(rq)); + cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_RT); +out: + raw_spin_unlock(&rq->lock); + + /* + * This can free the task_struct if no more references. + */ + put_task_struct(p); + + return HRTIMER_NORESTART; +} + +void init_rt_schedtune_timer(struct sched_rt_entity *rt_se) +{ + struct hrtimer *timer = &rt_se->schedtune_timer; + + hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + timer->function = rt_schedtune_timer; + rt_se->schedtune_enqueued = false; +} + +static void start_schedtune_timer(struct sched_rt_entity *rt_se) +{ + struct hrtimer *timer = &rt_se->schedtune_timer; + + hrtimer_start(timer, ns_to_ktime(RT_SCHEDTUNE_INTERVAL), + HRTIMER_MODE_REL_PINNED); +} + /* * Update the current task's runtime statistics. Skip current tasks that * are not in our scheduling class. @@ -1312,7 +1377,32 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) if (!task_current(rq, p) && p->nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); + if (!schedtune_task_boost(p)) + return; + + /* + * If schedtune timer is active, that means a boost was already + * done, just cancel the timer so that deboost doesn't happen. + * Otherwise, increase the boost. If an enqueued timer was + * cancelled, put the task reference. + */ + if (hrtimer_try_to_cancel(&rt_se->schedtune_timer) == 1) + put_task_struct(p); + + /* + * schedtune_enqueued can be true in the following situation: + * enqueue_task_rt grabs rq lock before timer fires + * or before its callback acquires rq lock + * schedtune_enqueued can be false if timer callback is running + * and timer just released rq lock, or if the timer finished + * running and canceling the boost + */ + if (rt_se->schedtune_enqueued) + return; + + rt_se->schedtune_enqueued = true; schedtune_enqueue_task(p, cpu_of(rq)); + cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_RT); } static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) @@ -1324,7 +1414,19 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) walt_dec_cumulative_runnable_avg(rq, p); dequeue_pushable_task(rq, p); + + if (!rt_se->schedtune_enqueued) + return; + + if (flags == DEQUEUE_SLEEP) { + get_task_struct(p); + start_schedtune_timer(rt_se); + return; + } + + rt_se->schedtune_enqueued = false; schedtune_dequeue_task(p, cpu_of(rq)); + cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_RT); } /* @@ -1364,6 +1466,32 @@ static void yield_task_rt(struct rq *rq) #ifdef CONFIG_SMP static int find_lowest_rq(struct task_struct *task); +/* + * Perform a schedtune dequeue and cancelation of boost timers if needed. + * Should be called only with the rq->lock held. + */ +static void schedtune_dequeue_rt(struct rq *rq, struct task_struct *p) +{ + struct sched_rt_entity *rt_se = &p->rt; + + BUG_ON(!raw_spin_is_locked(&rq->lock)); + + if (!rt_se->schedtune_enqueued) + return; + + /* + * Incase of class change cancel any active timers. If an enqueued + * timer was cancelled, put the task ref. + */ + if (hrtimer_try_to_cancel(&rt_se->schedtune_timer) == 1) + put_task_struct(p); + + /* schedtune_enqueued is true, deboost it */ + rt_se->schedtune_enqueued = false; + schedtune_dequeue_task(p, task_cpu(p)); + cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_RT); +} + static int select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags, int sibling_count_hint) @@ -1418,6 +1546,19 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags, rcu_read_unlock(); out: + /* + * If previous CPU was different, make sure to cancel any active + * schedtune timers and deboost. + */ + if (task_cpu(p) != cpu) { + unsigned long fl; + struct rq *prq = task_rq(p); + + raw_spin_lock_irqsave(&prq->lock, fl); + schedtune_dequeue_rt(prq, p); + raw_spin_unlock_irqrestore(&prq->lock, fl); + } + return cpu; } @@ -2162,6 +2303,13 @@ static void rq_offline_rt(struct rq *rq) */ static void switched_from_rt(struct rq *rq, struct task_struct *p) { + /* + * On class switch from rt, always cancel active schedtune timers, + * this handles the cases where we switch class for a task that is + * already rt-dequeued but has a running timer. + */ + schedtune_dequeue_rt(rq, p); + /* * If there are other RT tasks then we will reschedule * and the scheduling of the other RT tasks will handle diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 3db67eb07bdd..b9ba3279f095 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1408,6 +1408,7 @@ extern void resched_cpu(int cpu); extern struct rt_bandwidth def_rt_bandwidth; extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); +extern void init_rt_schedtune_timer(struct sched_rt_entity *rt_se); extern struct dl_bandwidth def_dl_bandwidth; extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime); -- cgit v1.2.3 From 1367d854b97493bfb1f3d24cf89ba60cb7f059ea Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 30 Jan 2018 03:37:38 +0100 Subject: bpf: fix branch pruning logic [ Upstream commit c131187db2d3fa2f8bf32fdf4e9a4ef805168467 ] when the verifier detects that register contains a runtime constant and it's compared with another constant it will prune exploration of the branch that is guaranteed not to be taken at runtime. This is all correct, but malicious program may be constructed in such a way that it always has a constant comparison and the other branch is never taken under any conditions. In this case such path through the program will not be explored by the verifier. It won't be taken at run-time either, but since all instructions are JITed the malicious program may cause JITs to complain about using reserved fields, etc. To fix the issue we have to track the instructions explored by the verifier and sanitize instructions that are dead at run time with NOPs. We cannot reject such dead code, since llvm generates it for valid C code, since it doesn't do as much data flow analysis as the verifier does. Fixes: 17a5267067f3 ("bpf: verifier (add verifier core)") Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: Daniel Borkmann Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/verifier.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 014c2d759916..a62679711de0 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -191,6 +191,7 @@ struct bpf_insn_aux_data { enum bpf_reg_type ptr_type; /* pointer type for load/store insns */ struct bpf_map *map_ptr; /* pointer for call insn into lookup_elem */ }; + bool seen; /* this insn was processed by the verifier */ }; #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ @@ -1793,6 +1794,7 @@ static int do_check(struct verifier_env *env) print_bpf_insn(env, insn); } + env->insn_aux_data[insn_idx].seen = true; if (class == BPF_ALU || class == BPF_ALU64) { err = check_alu_op(env, insn); if (err) @@ -1988,6 +1990,7 @@ process_bpf_exit: return err; insn_idx++; + env->insn_aux_data[insn_idx].seen = true; } else { verbose("invalid BPF_LD mode\n"); return -EINVAL; @@ -2125,6 +2128,7 @@ static int adjust_insn_aux_data(struct verifier_env *env, u32 prog_len, u32 off, u32 cnt) { struct bpf_insn_aux_data *new_data, *old_data = env->insn_aux_data; + int i; if (cnt == 1) return 0; @@ -2134,6 +2138,8 @@ static int adjust_insn_aux_data(struct verifier_env *env, u32 prog_len, memcpy(new_data, old_data, sizeof(struct bpf_insn_aux_data) * off); memcpy(new_data + off + cnt - 1, old_data + off, sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1)); + for (i = off; i < off + cnt - 1; i++) + new_data[i].seen = true; env->insn_aux_data = new_data; vfree(old_data); return 0; @@ -2152,6 +2158,25 @@ static struct bpf_prog *bpf_patch_insn_data(struct verifier_env *env, u32 off, return new_prog; } +/* The verifier does more data flow analysis than llvm and will not explore + * branches that are dead at run time. Malicious programs can have dead code + * too. Therefore replace all dead at-run-time code with nops. + */ +static void sanitize_dead_code(struct verifier_env *env) +{ + struct bpf_insn_aux_data *aux_data = env->insn_aux_data; + struct bpf_insn nop = BPF_MOV64_REG(BPF_REG_0, BPF_REG_0); + struct bpf_insn *insn = env->prog->insnsi; + const int insn_cnt = env->prog->len; + int i; + + for (i = 0; i < insn_cnt; i++) { + if (aux_data[i].seen) + continue; + memcpy(insn + i, &nop, sizeof(nop)); + } +} + /* convert load instructions that access fields of 'struct __sk_buff' * into sequence of instructions that access fields of 'struct sk_buff' */ @@ -2370,6 +2395,9 @@ skip_full_check: while (pop_stack(env, NULL) >= 0); free_states(env); + if (ret == 0) + sanitize_dead_code(env); + if (ret == 0) /* program is valid, convert *(u32*)(ctx + off) accesses */ ret = convert_ctx_accesses(env); -- cgit v1.2.3 From 361fb0481247bea4da3eb122e685c8b72ef7c8a9 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 30 Jan 2018 03:37:40 +0100 Subject: bpf: fix bpf_tail_call() x64 JIT [ upstream commit 90caccdd8cc0215705f18b92771b449b01e2474a ] - bpf prog_array just like all other types of bpf array accepts 32-bit index. Clarify that in the comment. - fix x64 JIT of bpf_tail_call which was incorrectly loading 8 instead of 4 bytes - tighten corresponding check in the interpreter to stay consistent The JIT bug can be triggered after introduction of BPF_F_NUMA_NODE flag in commit 96eabe7a40aa in 4.14. Before that the map_flags would stay zero and though JIT code is wrong it will check bounds correctly. Hence two fixes tags. All other JITs don't have this problem. Signed-off-by: Alexei Starovoitov Fixes: 96eabe7a40aa ("bpf: Allow selecting numa node during map creation") Fixes: b52f00e6a715 ("x86: bpf_jit: implement bpf_tail_call() helper") Acked-by: Daniel Borkmann Acked-by: Martin KaFai Lau Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 3fd76cf0c21e..e54ea31c5103 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -517,7 +517,7 @@ select_insn: struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2; struct bpf_array *array = container_of(map, struct bpf_array, map); struct bpf_prog *prog; - u64 index = BPF_R3; + u32 index = BPF_R3; if (unlikely(index >= array->map.max_entries)) goto out; -- cgit v1.2.3 From 28c486744e6de4d882a1d853aa63d99fcba4b7a6 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 30 Jan 2018 03:37:41 +0100 Subject: bpf: introduce BPF_JIT_ALWAYS_ON config [ upstream commit 290af86629b25ffd1ed6232c4e9107da031705cb ] The BPF interpreter has been used as part of the spectre 2 attack CVE-2017-5715. A quote from goolge project zero blog: "At this point, it would normally be necessary to locate gadgets in the host kernel code that can be used to actually leak data by reading from an attacker-controlled location, shifting and masking the result appropriately and then using the result of that as offset to an attacker-controlled address for a load. But piecing gadgets together and figuring out which ones work in a speculation context seems annoying. So instead, we decided to use the eBPF interpreter, which is built into the host kernel - while there is no legitimate way to invoke it from inside a VM, the presence of the code in the host kernel's text section is sufficient to make it usable for the attack, just like with ordinary ROP gadgets." To make attacker job harder introduce BPF_JIT_ALWAYS_ON config option that removes interpreter from the kernel in favor of JIT-only mode. So far eBPF JIT is supported by: x64, arm64, arm32, sparc64, s390, powerpc64, mips64 The start of JITed program is randomized and code page is marked as read-only. In addition "constant blinding" can be turned on with net.core.bpf_jit_harden v2->v3: - move __bpf_prog_ret0 under ifdef (Daniel) v1->v2: - fix init order, test_bpf and cBPF (Daniel's feedback) - fix offloaded bpf (Jakub's feedback) - add 'return 0' dummy in case something can invoke prog->bpf_func - retarget bpf tree. For bpf-next the patch would need one extra hunk. It will be sent when the trees are merged back to net-next Considered doing: int bpf_jit_enable __read_mostly = BPF_EBPF_JIT_DEFAULT; but it seems better to land the patch as-is and in bpf-next remove bpf_jit_enable global variable from all JITs, consolidate in one place and remove this jit_init() function. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/core.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index e54ea31c5103..c40e25e63bb0 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -256,6 +256,7 @@ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) } EXPORT_SYMBOL_GPL(__bpf_call_base); +#ifndef CONFIG_BPF_JIT_ALWAYS_ON /** * __bpf_prog_run - run eBPF program on a given context * @ctx: is the data we are operating on @@ -725,6 +726,13 @@ load_byte: return 0; } +#else +static unsigned int __bpf_prog_ret0(void *ctx, const struct bpf_insn *insn) +{ + return 0; +} +#endif + bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp) { @@ -771,9 +779,23 @@ static int bpf_check_tail_call(const struct bpf_prog *fp) */ int bpf_prog_select_runtime(struct bpf_prog *fp) { +#ifndef CONFIG_BPF_JIT_ALWAYS_ON fp->bpf_func = (void *) __bpf_prog_run; - +#else + fp->bpf_func = (void *) __bpf_prog_ret0; +#endif + + /* eBPF JITs can rewrite the program in case constant + * blinding is active. However, in case of error during + * blinding, bpf_int_jit_compile() must always return a + * valid program, which in this case would simply not + * be JITed, but falls back to the interpreter. + */ bpf_int_jit_compile(fp); +#ifdef CONFIG_BPF_JIT_ALWAYS_ON + if (!fp->jited) + return -ENOTSUPP; +#endif bpf_prog_lock_ro(fp); /* The tail call compatibility check can only be done at -- cgit v1.2.3 From 7dcda40e52ff0712a2d7d5353c1722cb1f994330 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 30 Jan 2018 03:37:42 +0100 Subject: bpf: arsh is not supported in 32 bit alu thus reject it [ upstream commit 7891a87efc7116590eaba57acc3c422487802c6f ] The following snippet was throwing an 'unknown opcode cc' warning in BPF interpreter: 0: (18) r0 = 0x0 2: (7b) *(u64 *)(r10 -16) = r0 3: (cc) (u32) r0 s>>= (u32) r0 4: (95) exit Although a number of JITs do support BPF_ALU | BPF_ARSH | BPF_{K,X} generation, not all of them do and interpreter does neither. We can leave existing ones and implement it later in bpf-next for the remaining ones, but reject this properly in verifier for the time being. Fixes: 17a5267067f3 ("bpf: verifier (add verifier core)") Reported-by: syzbot+93c4904c5c70348a6890@syzkaller.appspotmail.com Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/verifier.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a62679711de0..32af15136bf4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1165,6 +1165,11 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn) return -EINVAL; } + if (opcode == BPF_ARSH && BPF_CLASS(insn->code) != BPF_ALU64) { + verbose("BPF_ARSH not supported for 32 bit ALU\n"); + return -EINVAL; + } + if ((opcode == BPF_LSH || opcode == BPF_RSH || opcode == BPF_ARSH) && BPF_SRC(insn->code) == BPF_K) { int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32; -- cgit v1.2.3 From b72ba2a0d82447538c7c977ccb3f2b31b19b7767 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 30 Jan 2018 03:37:44 +0100 Subject: bpf: fix divides by zero [ upstream commit c366287ebd698ef5e3de300d90cd62ee9ee7373e ] Divides by zero are not nice, lets avoid them if possible. Also do_div() seems not needed when dealing with 32bit operands, but this seems a minor detail. Fixes: bd4cf0ed331a ("net: filter: rework/optimize internal BPF interpreter's instruction set") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: Alexei Starovoitov Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index c40e25e63bb0..eb52d11fdaa7 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -444,7 +444,7 @@ select_insn: DST = tmp; CONT; ALU_MOD_X: - if (unlikely(SRC == 0)) + if (unlikely((u32)SRC == 0)) return 0; tmp = (u32) DST; DST = do_div(tmp, (u32) SRC); @@ -463,7 +463,7 @@ select_insn: DST = div64_u64(DST, SRC); CONT; ALU_DIV_X: - if (unlikely(SRC == 0)) + if (unlikely((u32)SRC == 0)) return 0; tmp = (u32) DST; do_div(tmp, (u32) SRC); -- cgit v1.2.3 From 02662601a231f8721930168ce71d84bcfb8d9a96 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 30 Jan 2018 03:37:45 +0100 Subject: bpf: fix 32-bit divide by zero [ upstream commit 68fda450a7df51cff9e5a4d4a4d9d0d5f2589153 ] due to some JITs doing if (src_reg == 0) check in 64-bit mode for div/mod operations mask upper 32-bits of src register before doing the check Fixes: 622582786c9e ("net: filter: x86: internal BPF JIT") Fixes: 7a12b5031c6b ("sparc64: Add eBPF JIT.") Reported-by: syzbot+48340bb518e88849e2e3@syzkaller.appspotmail.com Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/verifier.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 32af15136bf4..c9156bd430bc 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2248,6 +2248,24 @@ static int fixup_bpf_calls(struct verifier_env *env) int i, cnt, delta = 0; for (i = 0; i < insn_cnt; i++, insn++) { + if (insn->code == (BPF_ALU | BPF_MOD | BPF_X) || + insn->code == (BPF_ALU | BPF_DIV | BPF_X)) { + /* due to JIT bugs clear upper 32-bits of src register + * before div/mod operation + */ + insn_buf[0] = BPF_MOV32_REG(insn->src_reg, insn->src_reg); + insn_buf[1] = *insn; + cnt = 2; + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + continue; + } + if (insn->code != (BPF_JMP | BPF_CALL)) continue; -- cgit v1.2.3 From faa74a862a9442233bff39a496013a74775fb660 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 30 Jan 2018 03:37:46 +0100 Subject: bpf: reject stores into ctx via st and xadd [ upstream commit f37a8cb84cce18762e8f86a70bd6a49a66ab964c ] Alexei found that verifier does not reject stores into context via BPF_ST instead of BPF_STX. And while looking at it, we also should not allow XADD variant of BPF_STX. The context rewriter is only assuming either BPF_LDX_MEM- or BPF_STX_MEM-type operations, thus reject anything other than that so that assumptions in the rewriter properly hold. Add test cases as well for BPF selftests. Fixes: d691f9e8d440 ("bpf: allow programs to write to certain skb fields") Reported-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/verifier.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c9156bd430bc..c14003840bc5 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -683,6 +683,13 @@ static bool is_pointer_value(struct verifier_env *env, int regno) } } +static bool is_ctx_reg(struct verifier_env *env, int regno) +{ + const struct reg_state *reg = &env->cur_state.regs[regno]; + + return reg->type == PTR_TO_CTX; +} + /* check whether memory at (regno + off) is accessible for t = (read | write) * if t==write, value_regno is a register which value is stored into memory * if t==read, value_regno is a register which will receive the value from memory @@ -779,6 +786,12 @@ static int check_xadd(struct verifier_env *env, struct bpf_insn *insn) return -EACCES; } + if (is_ctx_reg(env, insn->dst_reg)) { + verbose("BPF_XADD stores into R%d context is not allowed\n", + insn->dst_reg); + return -EACCES; + } + /* check whether atomic_add can read the memory */ err = check_mem_access(env, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_READ, -1); @@ -1909,6 +1922,12 @@ static int do_check(struct verifier_env *env) if (err) return err; + if (is_ctx_reg(env, insn->dst_reg)) { + verbose("BPF_ST stores into R%d context is not allowed\n", + insn->dst_reg); + return -EACCES; + } + /* check that memory (dst_reg + off) is writeable */ err = check_mem_access(env, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_WRITE, -- cgit v1.2.3