summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/bpf/core.c30
-rw-r--r--kernel/bpf/verifier.c70
-rw-r--r--kernel/sched/core.c1
-rw-r--r--kernel/sched/rt.c148
-rw-r--r--kernel/sched/sched.h1
5 files changed, 246 insertions, 4 deletions
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 3fd76cf0c21e..eb52d11fdaa7 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -256,6 +256,7 @@ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
}
EXPORT_SYMBOL_GPL(__bpf_call_base);
+#ifndef CONFIG_BPF_JIT_ALWAYS_ON
/**
* __bpf_prog_run - run eBPF program on a given context
* @ctx: is the data we are operating on
@@ -443,7 +444,7 @@ select_insn:
DST = tmp;
CONT;
ALU_MOD_X:
- if (unlikely(SRC == 0))
+ if (unlikely((u32)SRC == 0))
return 0;
tmp = (u32) DST;
DST = do_div(tmp, (u32) SRC);
@@ -462,7 +463,7 @@ select_insn:
DST = div64_u64(DST, SRC);
CONT;
ALU_DIV_X:
- if (unlikely(SRC == 0))
+ if (unlikely((u32)SRC == 0))
return 0;
tmp = (u32) DST;
do_div(tmp, (u32) SRC);
@@ -517,7 +518,7 @@ select_insn:
struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2;
struct bpf_array *array = container_of(map, struct bpf_array, map);
struct bpf_prog *prog;
- u64 index = BPF_R3;
+ u32 index = BPF_R3;
if (unlikely(index >= array->map.max_entries))
goto out;
@@ -725,6 +726,13 @@ load_byte:
return 0;
}
+#else
+static unsigned int __bpf_prog_ret0(void *ctx, const struct bpf_insn *insn)
+{
+ return 0;
+}
+#endif
+
bool bpf_prog_array_compatible(struct bpf_array *array,
const struct bpf_prog *fp)
{
@@ -771,9 +779,23 @@ static int bpf_check_tail_call(const struct bpf_prog *fp)
*/
int bpf_prog_select_runtime(struct bpf_prog *fp)
{
+#ifndef CONFIG_BPF_JIT_ALWAYS_ON
fp->bpf_func = (void *) __bpf_prog_run;
-
+#else
+ fp->bpf_func = (void *) __bpf_prog_ret0;
+#endif
+
+ /* eBPF JITs can rewrite the program in case constant
+ * blinding is active. However, in case of error during
+ * blinding, bpf_int_jit_compile() must always return a
+ * valid program, which in this case would simply not
+ * be JITed, but falls back to the interpreter.
+ */
bpf_int_jit_compile(fp);
+#ifdef CONFIG_BPF_JIT_ALWAYS_ON
+ if (!fp->jited)
+ return -ENOTSUPP;
+#endif
bpf_prog_lock_ro(fp);
/* The tail call compatibility check can only be done at
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 014c2d759916..c14003840bc5 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -191,6 +191,7 @@ struct bpf_insn_aux_data {
enum bpf_reg_type ptr_type; /* pointer type for load/store insns */
struct bpf_map *map_ptr; /* pointer for call insn into lookup_elem */
};
+ bool seen; /* this insn was processed by the verifier */
};
#define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */
@@ -682,6 +683,13 @@ static bool is_pointer_value(struct verifier_env *env, int regno)
}
}
+static bool is_ctx_reg(struct verifier_env *env, int regno)
+{
+ const struct reg_state *reg = &env->cur_state.regs[regno];
+
+ return reg->type == PTR_TO_CTX;
+}
+
/* check whether memory at (regno + off) is accessible for t = (read | write)
* if t==write, value_regno is a register which value is stored into memory
* if t==read, value_regno is a register which will receive the value from memory
@@ -778,6 +786,12 @@ static int check_xadd(struct verifier_env *env, struct bpf_insn *insn)
return -EACCES;
}
+ if (is_ctx_reg(env, insn->dst_reg)) {
+ verbose("BPF_XADD stores into R%d context is not allowed\n",
+ insn->dst_reg);
+ return -EACCES;
+ }
+
/* check whether atomic_add can read the memory */
err = check_mem_access(env, insn->dst_reg, insn->off,
BPF_SIZE(insn->code), BPF_READ, -1);
@@ -1164,6 +1178,11 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn)
return -EINVAL;
}
+ if (opcode == BPF_ARSH && BPF_CLASS(insn->code) != BPF_ALU64) {
+ verbose("BPF_ARSH not supported for 32 bit ALU\n");
+ return -EINVAL;
+ }
+
if ((opcode == BPF_LSH || opcode == BPF_RSH ||
opcode == BPF_ARSH) && BPF_SRC(insn->code) == BPF_K) {
int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32;
@@ -1793,6 +1812,7 @@ static int do_check(struct verifier_env *env)
print_bpf_insn(env, insn);
}
+ env->insn_aux_data[insn_idx].seen = true;
if (class == BPF_ALU || class == BPF_ALU64) {
err = check_alu_op(env, insn);
if (err)
@@ -1902,6 +1922,12 @@ static int do_check(struct verifier_env *env)
if (err)
return err;
+ if (is_ctx_reg(env, insn->dst_reg)) {
+ verbose("BPF_ST stores into R%d context is not allowed\n",
+ insn->dst_reg);
+ return -EACCES;
+ }
+
/* check that memory (dst_reg + off) is writeable */
err = check_mem_access(env, insn->dst_reg, insn->off,
BPF_SIZE(insn->code), BPF_WRITE,
@@ -1988,6 +2014,7 @@ process_bpf_exit:
return err;
insn_idx++;
+ env->insn_aux_data[insn_idx].seen = true;
} else {
verbose("invalid BPF_LD mode\n");
return -EINVAL;
@@ -2125,6 +2152,7 @@ static int adjust_insn_aux_data(struct verifier_env *env, u32 prog_len,
u32 off, u32 cnt)
{
struct bpf_insn_aux_data *new_data, *old_data = env->insn_aux_data;
+ int i;
if (cnt == 1)
return 0;
@@ -2134,6 +2162,8 @@ static int adjust_insn_aux_data(struct verifier_env *env, u32 prog_len,
memcpy(new_data, old_data, sizeof(struct bpf_insn_aux_data) * off);
memcpy(new_data + off + cnt - 1, old_data + off,
sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1));
+ for (i = off; i < off + cnt - 1; i++)
+ new_data[i].seen = true;
env->insn_aux_data = new_data;
vfree(old_data);
return 0;
@@ -2152,6 +2182,25 @@ static struct bpf_prog *bpf_patch_insn_data(struct verifier_env *env, u32 off,
return new_prog;
}
+/* The verifier does more data flow analysis than llvm and will not explore
+ * branches that are dead at run time. Malicious programs can have dead code
+ * too. Therefore replace all dead at-run-time code with nops.
+ */
+static void sanitize_dead_code(struct verifier_env *env)
+{
+ struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
+ struct bpf_insn nop = BPF_MOV64_REG(BPF_REG_0, BPF_REG_0);
+ struct bpf_insn *insn = env->prog->insnsi;
+ const int insn_cnt = env->prog->len;
+ int i;
+
+ for (i = 0; i < insn_cnt; i++) {
+ if (aux_data[i].seen)
+ continue;
+ memcpy(insn + i, &nop, sizeof(nop));
+ }
+}
+
/* convert load instructions that access fields of 'struct __sk_buff'
* into sequence of instructions that access fields of 'struct sk_buff'
*/
@@ -2218,6 +2267,24 @@ static int fixup_bpf_calls(struct verifier_env *env)
int i, cnt, delta = 0;
for (i = 0; i < insn_cnt; i++, insn++) {
+ if (insn->code == (BPF_ALU | BPF_MOD | BPF_X) ||
+ insn->code == (BPF_ALU | BPF_DIV | BPF_X)) {
+ /* due to JIT bugs clear upper 32-bits of src register
+ * before div/mod operation
+ */
+ insn_buf[0] = BPF_MOV32_REG(insn->src_reg, insn->src_reg);
+ insn_buf[1] = *insn;
+ cnt = 2;
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ continue;
+ }
+
if (insn->code != (BPF_JMP | BPF_CALL))
continue;
@@ -2371,6 +2438,9 @@ skip_full_check:
free_states(env);
if (ret == 0)
+ sanitize_dead_code(env);
+
+ if (ret == 0)
/* program is valid, convert *(u32*)(ctx + off) accesses */
ret = convert_ctx_accesses(env);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 59ba1d1266a7..41a8fdb1436c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2368,6 +2368,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
init_dl_task_timer(&p->dl);
__dl_clear_params(p);
+ init_rt_schedtune_timer(&p->rt);
INIT_LIST_HEAD(&p->rt.run_list);
p->rt.timeout = 0;
p->rt.time_slice = sched_rr_timeslice;
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 8025828ff4e0..baad64b94f15 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -9,6 +9,7 @@
#include <linux/slab.h>
#include <linux/irq_work.h>
#include <trace/events/sched.h>
+#include <linux/hrtimer.h>
#include "tune.h"
@@ -980,6 +981,70 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
return 0;
}
+#define RT_SCHEDTUNE_INTERVAL 50000000ULL
+
+static enum hrtimer_restart rt_schedtune_timer(struct hrtimer *timer)
+{
+ struct sched_rt_entity *rt_se = container_of(timer,
+ struct sched_rt_entity,
+ schedtune_timer);
+ struct task_struct *p = rt_task_of(rt_se);
+ struct rq *rq = task_rq(p);
+
+ raw_spin_lock(&rq->lock);
+
+ /*
+ * Nothing to do if:
+ * - task has switched runqueues
+ * - task isn't RT anymore
+ */
+ if (rq != task_rq(p) || (p->sched_class != &rt_sched_class))
+ goto out;
+
+ /*
+ * If task got enqueued back during callback time, it means we raced
+ * with the enqueue on another cpu, that's Ok, just do nothing as
+ * enqueue path would have tried to cancel us and we shouldn't run
+ * Also check the schedtune_enqueued flag as class-switch on a
+ * sleeping task may have already canceled the timer and done dq
+ */
+ if (p->on_rq || !rt_se->schedtune_enqueued)
+ goto out;
+
+ /*
+ * RT task is no longer active, cancel boost
+ */
+ rt_se->schedtune_enqueued = false;
+ schedtune_dequeue_task(p, cpu_of(rq));
+ cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_RT);
+out:
+ raw_spin_unlock(&rq->lock);
+
+ /*
+ * This can free the task_struct if no more references.
+ */
+ put_task_struct(p);
+
+ return HRTIMER_NORESTART;
+}
+
+void init_rt_schedtune_timer(struct sched_rt_entity *rt_se)
+{
+ struct hrtimer *timer = &rt_se->schedtune_timer;
+
+ hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ timer->function = rt_schedtune_timer;
+ rt_se->schedtune_enqueued = false;
+}
+
+static void start_schedtune_timer(struct sched_rt_entity *rt_se)
+{
+ struct hrtimer *timer = &rt_se->schedtune_timer;
+
+ hrtimer_start(timer, ns_to_ktime(RT_SCHEDTUNE_INTERVAL),
+ HRTIMER_MODE_REL_PINNED);
+}
+
/*
* Update the current task's runtime statistics. Skip current tasks that
* are not in our scheduling class.
@@ -1386,7 +1451,32 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
enqueue_pushable_task(rq, p);
+ if (!schedtune_task_boost(p))
+ return;
+
+ /*
+ * If schedtune timer is active, that means a boost was already
+ * done, just cancel the timer so that deboost doesn't happen.
+ * Otherwise, increase the boost. If an enqueued timer was
+ * cancelled, put the task reference.
+ */
+ if (hrtimer_try_to_cancel(&rt_se->schedtune_timer) == 1)
+ put_task_struct(p);
+
+ /*
+ * schedtune_enqueued can be true in the following situation:
+ * enqueue_task_rt grabs rq lock before timer fires
+ * or before its callback acquires rq lock
+ * schedtune_enqueued can be false if timer callback is running
+ * and timer just released rq lock, or if the timer finished
+ * running and canceling the boost
+ */
+ if (rt_se->schedtune_enqueued)
+ return;
+
+ rt_se->schedtune_enqueued = true;
schedtune_enqueue_task(p, cpu_of(rq));
+ cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_RT);
}
static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
@@ -1398,7 +1488,19 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
dec_hmp_sched_stats_rt(rq, p);
dequeue_pushable_task(rq, p);
+
+ if (!rt_se->schedtune_enqueued)
+ return;
+
+ if (flags == DEQUEUE_SLEEP) {
+ get_task_struct(p);
+ start_schedtune_timer(rt_se);
+ return;
+ }
+
+ rt_se->schedtune_enqueued = false;
schedtune_dequeue_task(p, cpu_of(rq));
+ cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_RT);
}
/*
@@ -1472,6 +1574,32 @@ task_may_not_preempt(struct task_struct *task, int cpu)
task_thread_info(task)->preempt_count & SOFTIRQ_MASK));
}
+/*
+ * Perform a schedtune dequeue and cancelation of boost timers if needed.
+ * Should be called only with the rq->lock held.
+ */
+static void schedtune_dequeue_rt(struct rq *rq, struct task_struct *p)
+{
+ struct sched_rt_entity *rt_se = &p->rt;
+
+ BUG_ON(!raw_spin_is_locked(&rq->lock));
+
+ if (!rt_se->schedtune_enqueued)
+ return;
+
+ /*
+ * Incase of class change cancel any active timers. If an enqueued
+ * timer was cancelled, put the task ref.
+ */
+ if (hrtimer_try_to_cancel(&rt_se->schedtune_timer) == 1)
+ put_task_struct(p);
+
+ /* schedtune_enqueued is true, deboost it */
+ rt_se->schedtune_enqueued = false;
+ schedtune_dequeue_task(p, task_cpu(p));
+ cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_RT);
+}
+
static int
select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags,
int sibling_count_hint)
@@ -1546,6 +1674,19 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags,
rcu_read_unlock();
out:
+ /*
+ * If previous CPU was different, make sure to cancel any active
+ * schedtune timers and deboost.
+ */
+ if (task_cpu(p) != cpu) {
+ unsigned long fl;
+ struct rq *prq = task_rq(p);
+
+ raw_spin_lock_irqsave(&prq->lock, fl);
+ schedtune_dequeue_rt(prq, p);
+ raw_spin_unlock_irqrestore(&prq->lock, fl);
+ }
+
return cpu;
}
@@ -2402,6 +2543,13 @@ static void rq_offline_rt(struct rq *rq)
static void switched_from_rt(struct rq *rq, struct task_struct *p)
{
/*
+ * On class switch from rt, always cancel active schedtune timers,
+ * this handles the cases where we switch class for a task that is
+ * already rt-dequeued but has a running timer.
+ */
+ schedtune_dequeue_rt(rq, p);
+
+ /*
* If there are other RT tasks then we will reschedule
* and the scheduling of the other RT tasks will handle
* the balancing. But if we are the last RT task
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ca2294d06f44..6ef3a59612a9 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2199,6 +2199,7 @@ extern void resched_cpu(int cpu);
extern struct rt_bandwidth def_rt_bandwidth;
extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
+extern void init_rt_schedtune_timer(struct sched_rt_entity *rt_se);
extern struct dl_bandwidth def_dl_bandwidth;
extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime);