summaryrefslogtreecommitdiff
path: root/kernel/bpf
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/bpf')
-rw-r--r--kernel/bpf/arraymap.c43
-rw-r--r--kernel/bpf/core.c101
-rw-r--r--kernel/bpf/hashtab.c9
-rw-r--r--kernel/bpf/syscall.c89
-rw-r--r--kernel/bpf/verifier.c415
5 files changed, 480 insertions, 177 deletions
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index b0799bced518..daa4e0782cf7 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -20,8 +20,10 @@
/* Called from syscall */
static struct bpf_map *array_map_alloc(union bpf_attr *attr)
{
+ u32 elem_size, array_size, index_mask, max_entries;
+ bool unpriv = !capable(CAP_SYS_ADMIN);
struct bpf_array *array;
- u32 elem_size, array_size;
+ u64 mask64;
/* check sanity of attributes */
if (attr->max_entries == 0 || attr->key_size != 4 ||
@@ -36,12 +38,33 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
elem_size = round_up(attr->value_size, 8);
+ max_entries = attr->max_entries;
+
+ /* On 32 bit archs roundup_pow_of_two() with max_entries that has
+ * upper most bit set in u32 space is undefined behavior due to
+ * resulting 1U << 32, so do it manually here in u64 space.
+ */
+ mask64 = fls_long(max_entries - 1);
+ mask64 = 1ULL << mask64;
+ mask64 -= 1;
+
+ index_mask = mask64;
+ if (unpriv) {
+ /* round up array size to nearest power of 2,
+ * since cpu will speculate within index_mask limits
+ */
+ max_entries = index_mask + 1;
+ /* Check for overflows. */
+ if (max_entries < attr->max_entries)
+ return ERR_PTR(-E2BIG);
+ }
+
/* check round_up into zero and u32 overflow */
if (elem_size == 0 ||
- attr->max_entries > (U32_MAX - PAGE_SIZE - sizeof(*array)) / elem_size)
+ max_entries > (U32_MAX - PAGE_SIZE - sizeof(*array)) / elem_size)
return ERR_PTR(-ENOMEM);
- array_size = sizeof(*array) + attr->max_entries * elem_size;
+ array_size = sizeof(*array) + max_entries * elem_size;
/* allocate all map elements and zero-initialize them */
array = kzalloc(array_size, GFP_USER | __GFP_NOWARN);
@@ -50,6 +73,8 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
if (!array)
return ERR_PTR(-ENOMEM);
}
+ array->index_mask = index_mask;
+ array->map.unpriv_array = unpriv;
/* copy mandatory map attributes */
array->map.key_size = attr->key_size;
@@ -70,14 +95,14 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key)
if (index >= array->map.max_entries)
return NULL;
- return array->value + array->elem_size * index;
+ return array->value + array->elem_size * (index & array->index_mask);
}
/* Called from syscall */
static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
- u32 index = *(u32 *)key;
+ u32 index = key ? *(u32 *)key : U32_MAX;
u32 *next = (u32 *)next_key;
if (index >= array->map.max_entries) {
@@ -111,7 +136,9 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
/* all elements already exist */
return -EEXIST;
- memcpy(array->value + array->elem_size * index, value, map->value_size);
+ memcpy(array->value +
+ array->elem_size * (index & array->index_mask),
+ value, map->value_size);
return 0;
}
@@ -243,9 +270,7 @@ static void *prog_fd_array_get_ptr(struct bpf_map *map, int fd)
static void prog_fd_array_put_ptr(void *ptr)
{
- struct bpf_prog *prog = ptr;
-
- bpf_prog_put_rcu(prog);
+ bpf_prog_put(ptr);
}
/* decrement refcnt of all bpf_progs that are stored in this map */
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 334b1bdd572c..eb52d11fdaa7 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -137,6 +137,77 @@ void __bpf_prog_free(struct bpf_prog *fp)
}
EXPORT_SYMBOL_GPL(__bpf_prog_free);
+static bool bpf_is_jmp_and_has_target(const struct bpf_insn *insn)
+{
+ return BPF_CLASS(insn->code) == BPF_JMP &&
+ /* Call and Exit are both special jumps with no
+ * target inside the BPF instruction image.
+ */
+ BPF_OP(insn->code) != BPF_CALL &&
+ BPF_OP(insn->code) != BPF_EXIT;
+}
+
+static void bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta)
+{
+ struct bpf_insn *insn = prog->insnsi;
+ u32 i, insn_cnt = prog->len;
+
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ if (!bpf_is_jmp_and_has_target(insn))
+ continue;
+
+ /* Adjust offset of jmps if we cross boundaries. */
+ if (i < pos && i + insn->off + 1 > pos)
+ insn->off += delta;
+ else if (i > pos + delta && i + insn->off + 1 <= pos + delta)
+ insn->off -= delta;
+ }
+}
+
+struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
+ const struct bpf_insn *patch, u32 len)
+{
+ u32 insn_adj_cnt, insn_rest, insn_delta = len - 1;
+ struct bpf_prog *prog_adj;
+
+ /* Since our patchlet doesn't expand the image, we're done. */
+ if (insn_delta == 0) {
+ memcpy(prog->insnsi + off, patch, sizeof(*patch));
+ return prog;
+ }
+
+ insn_adj_cnt = prog->len + insn_delta;
+
+ /* Several new instructions need to be inserted. Make room
+ * for them. Likely, there's no need for a new allocation as
+ * last page could have large enough tailroom.
+ */
+ prog_adj = bpf_prog_realloc(prog, bpf_prog_size(insn_adj_cnt),
+ GFP_USER);
+ if (!prog_adj)
+ return NULL;
+
+ prog_adj->len = insn_adj_cnt;
+
+ /* Patching happens in 3 steps:
+ *
+ * 1) Move over tail of insnsi from next instruction onwards,
+ * so we can patch the single target insn with one or more
+ * new ones (patching is always from 1 to n insns, n > 0).
+ * 2) Inject new instructions at the target location.
+ * 3) Adjust branch offsets if necessary.
+ */
+ insn_rest = insn_adj_cnt - off - len;
+
+ memmove(prog_adj->insnsi + off + len, prog_adj->insnsi + off + 1,
+ sizeof(*patch) * insn_rest);
+ memcpy(prog_adj->insnsi + off, patch, sizeof(*patch) * len);
+
+ bpf_adj_branches(prog_adj, off, insn_delta);
+
+ return prog_adj;
+}
+
#ifdef CONFIG_BPF_JIT
struct bpf_binary_header *
bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
@@ -185,6 +256,7 @@ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
}
EXPORT_SYMBOL_GPL(__bpf_call_base);
+#ifndef CONFIG_BPF_JIT_ALWAYS_ON
/**
* __bpf_prog_run - run eBPF program on a given context
* @ctx: is the data we are operating on
@@ -372,7 +444,7 @@ select_insn:
DST = tmp;
CONT;
ALU_MOD_X:
- if (unlikely(SRC == 0))
+ if (unlikely((u32)SRC == 0))
return 0;
tmp = (u32) DST;
DST = do_div(tmp, (u32) SRC);
@@ -391,7 +463,7 @@ select_insn:
DST = div64_u64(DST, SRC);
CONT;
ALU_DIV_X:
- if (unlikely(SRC == 0))
+ if (unlikely((u32)SRC == 0))
return 0;
tmp = (u32) DST;
do_div(tmp, (u32) SRC);
@@ -446,7 +518,7 @@ select_insn:
struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2;
struct bpf_array *array = container_of(map, struct bpf_array, map);
struct bpf_prog *prog;
- u64 index = BPF_R3;
+ u32 index = BPF_R3;
if (unlikely(index >= array->map.max_entries))
goto out;
@@ -654,6 +726,13 @@ load_byte:
return 0;
}
+#else
+static unsigned int __bpf_prog_ret0(void *ctx, const struct bpf_insn *insn)
+{
+ return 0;
+}
+#endif
+
bool bpf_prog_array_compatible(struct bpf_array *array,
const struct bpf_prog *fp)
{
@@ -700,9 +779,23 @@ static int bpf_check_tail_call(const struct bpf_prog *fp)
*/
int bpf_prog_select_runtime(struct bpf_prog *fp)
{
+#ifndef CONFIG_BPF_JIT_ALWAYS_ON
fp->bpf_func = (void *) __bpf_prog_run;
-
+#else
+ fp->bpf_func = (void *) __bpf_prog_ret0;
+#endif
+
+ /* eBPF JITs can rewrite the program in case constant
+ * blinding is active. However, in case of error during
+ * blinding, bpf_int_jit_compile() must always return a
+ * valid program, which in this case would simply not
+ * be JITed, but falls back to the interpreter.
+ */
bpf_int_jit_compile(fp);
+#ifdef CONFIG_BPF_JIT_ALWAYS_ON
+ if (!fp->jited)
+ return -ENOTSUPP;
+#endif
bpf_prog_lock_ro(fp);
/* The tail call compatibility check can only be done at
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 34777b3746fa..a35abe048239 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -169,12 +169,15 @@ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
struct hlist_head *head;
struct htab_elem *l, *next_l;
u32 hash, key_size;
- int i;
+ int i = 0;
WARN_ON_ONCE(!rcu_read_lock_held());
key_size = map->key_size;
+ if (!key)
+ goto find_first_elem;
+
hash = htab_map_hash(key, key_size);
head = select_bucket(htab, hash);
@@ -182,10 +185,8 @@ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
/* lookup the key */
l = lookup_elem_raw(head, hash, key, key_size);
- if (!l) {
- i = 0;
+ if (!l)
goto find_first_elem;
- }
/* key was found, get next key in the same bucket */
next_l = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&l->hash_node)),
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 4e32cc94edd9..04fc1022ad9f 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -390,14 +390,18 @@ static int map_get_next_key(union bpf_attr *attr)
if (IS_ERR(map))
return PTR_ERR(map);
- err = -ENOMEM;
- key = kmalloc(map->key_size, GFP_USER);
- if (!key)
- goto err_put;
-
- err = -EFAULT;
- if (copy_from_user(key, ukey, map->key_size) != 0)
- goto free_key;
+ if (ukey) {
+ err = -ENOMEM;
+ key = kmalloc(map->key_size, GFP_USER);
+ if (!key)
+ goto err_put;
+
+ err = -EFAULT;
+ if (copy_from_user(key, ukey, map->key_size) != 0)
+ goto free_key;
+ } else {
+ key = NULL;
+ }
err = -ENOMEM;
next_key = kmalloc(map->key_size, GFP_USER);
@@ -447,57 +451,6 @@ void bpf_register_prog_type(struct bpf_prog_type_list *tl)
list_add(&tl->list_node, &bpf_prog_types);
}
-/* fixup insn->imm field of bpf_call instructions:
- * if (insn->imm == BPF_FUNC_map_lookup_elem)
- * insn->imm = bpf_map_lookup_elem - __bpf_call_base;
- * else if (insn->imm == BPF_FUNC_map_update_elem)
- * insn->imm = bpf_map_update_elem - __bpf_call_base;
- * else ...
- *
- * this function is called after eBPF program passed verification
- */
-static void fixup_bpf_calls(struct bpf_prog *prog)
-{
- const struct bpf_func_proto *fn;
- int i;
-
- for (i = 0; i < prog->len; i++) {
- struct bpf_insn *insn = &prog->insnsi[i];
-
- if (insn->code == (BPF_JMP | BPF_CALL)) {
- /* we reach here when program has bpf_call instructions
- * and it passed bpf_check(), means that
- * ops->get_func_proto must have been supplied, check it
- */
- BUG_ON(!prog->aux->ops->get_func_proto);
-
- if (insn->imm == BPF_FUNC_get_route_realm)
- prog->dst_needed = 1;
- if (insn->imm == BPF_FUNC_get_prandom_u32)
- bpf_user_rnd_init_once();
- if (insn->imm == BPF_FUNC_tail_call) {
- /* mark bpf_tail_call as different opcode
- * to avoid conditional branch in
- * interpeter for every normal call
- * and to prevent accidental JITing by
- * JIT compiler that doesn't support
- * bpf_tail_call yet
- */
- insn->imm = 0;
- insn->code |= BPF_X;
- continue;
- }
-
- fn = prog->aux->ops->get_func_proto(insn->imm);
- /* all functions that have prototype and verifier allowed
- * programs to call them, must be real in-kernel functions
- */
- BUG_ON(!fn->func);
- insn->imm = fn->func - __bpf_call_base;
- }
- }
-}
-
/* drop refcnt on maps used by eBPF program and free auxilary data */
static void free_used_maps(struct bpf_prog_aux *aux)
{
@@ -534,7 +487,7 @@ static void bpf_prog_uncharge_memlock(struct bpf_prog *prog)
free_uid(user);
}
-static void __prog_put_common(struct rcu_head *rcu)
+static void __bpf_prog_put_rcu(struct rcu_head *rcu)
{
struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);
@@ -543,17 +496,10 @@ static void __prog_put_common(struct rcu_head *rcu)
bpf_prog_free(aux->prog);
}
-/* version of bpf_prog_put() that is called after a grace period */
-void bpf_prog_put_rcu(struct bpf_prog *prog)
-{
- if (atomic_dec_and_test(&prog->aux->refcnt))
- call_rcu(&prog->aux->rcu, __prog_put_common);
-}
-
void bpf_prog_put(struct bpf_prog *prog)
{
if (atomic_dec_and_test(&prog->aux->refcnt))
- __prog_put_common(&prog->aux->rcu);
+ call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
}
EXPORT_SYMBOL_GPL(bpf_prog_put);
@@ -561,7 +507,7 @@ static int bpf_prog_release(struct inode *inode, struct file *filp)
{
struct bpf_prog *prog = filp->private_data;
- bpf_prog_put_rcu(prog);
+ bpf_prog_put(prog);
return 0;
}
@@ -680,9 +626,6 @@ static int bpf_prog_load(union bpf_attr *attr)
if (err < 0)
goto free_used_maps;
- /* fixup BPF_CALL->imm field */
- fixup_bpf_calls(prog);
-
/* eBPF program is ready to be JITed */
err = bpf_prog_select_runtime(prog);
if (err < 0)
@@ -727,7 +670,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
union bpf_attr attr = {};
int err;
- if (!capable(CAP_SYS_ADMIN) && sysctl_unprivileged_bpf_disabled)
+ if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN))
return -EPERM;
if (!access_ok(VERIFY_READ, uattr, 1))
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 2cbfba78d3db..c43ca9857479 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -186,6 +186,15 @@ struct verifier_stack_elem {
struct verifier_stack_elem *next;
};
+struct bpf_insn_aux_data {
+ union {
+ enum bpf_reg_type ptr_type; /* pointer type for load/store insns */
+ struct bpf_map *map_ptr; /* pointer for call insn into lookup_elem */
+ };
+ int sanitize_stack_off; /* stack slot to be cleared */
+ bool seen; /* this insn was processed by the verifier */
+};
+
#define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */
/* single container for all structs
@@ -200,6 +209,7 @@ struct verifier_env {
struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */
u32 used_map_cnt; /* number of used maps */
bool allow_ptr_leaks;
+ struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */
};
/* verbose verifier prints what it's seeing
@@ -313,7 +323,8 @@ static const char *const bpf_jmp_string[16] = {
[BPF_EXIT >> 4] = "exit",
};
-static void print_bpf_insn(struct bpf_insn *insn)
+static void print_bpf_insn(const struct verifier_env *env,
+ const struct bpf_insn *insn)
{
u8 class = BPF_CLASS(insn->code);
@@ -377,9 +388,19 @@ static void print_bpf_insn(struct bpf_insn *insn)
insn->code,
bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
insn->src_reg, insn->imm);
- } else if (BPF_MODE(insn->code) == BPF_IMM) {
- verbose("(%02x) r%d = 0x%x\n",
- insn->code, insn->dst_reg, insn->imm);
+ } else if (BPF_MODE(insn->code) == BPF_IMM &&
+ BPF_SIZE(insn->code) == BPF_DW) {
+ /* At this point, we already made sure that the second
+ * part of the ldimm64 insn is accessible.
+ */
+ u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm;
+ bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD;
+
+ if (map_ptr && !env->allow_ptr_leaks)
+ imm = 0;
+
+ verbose("(%02x) r%d = 0x%llx\n", insn->code,
+ insn->dst_reg, (unsigned long long)imm);
} else {
verbose("BUG_ld_%02x\n", insn->code);
return;
@@ -549,10 +570,11 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
/* check_stack_read/write functions track spill/fill of registers,
* stack boundary and alignment are checked in check_mem_access()
*/
-static int check_stack_write(struct verifier_state *state, int off, int size,
- int value_regno)
+static int check_stack_write(struct verifier_env *env,
+ struct verifier_state *state, int off,
+ int size, int value_regno, int insn_idx)
{
- int i;
+ int i, spi = (MAX_BPF_STACK + off) / BPF_REG_SIZE;
/* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0,
* so it's aligned access and [off, off + size) are within stack limits
*/
@@ -567,15 +589,37 @@ static int check_stack_write(struct verifier_state *state, int off, int size,
}
/* save register state */
- state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] =
- state->regs[value_regno];
-
- for (i = 0; i < BPF_REG_SIZE; i++)
+ state->spilled_regs[spi] = state->regs[value_regno];
+
+ for (i = 0; i < BPF_REG_SIZE; i++) {
+ if (state->stack_slot_type[MAX_BPF_STACK + off + i] == STACK_MISC &&
+ !env->allow_ptr_leaks) {
+ int *poff = &env->insn_aux_data[insn_idx].sanitize_stack_off;
+ int soff = (-spi - 1) * BPF_REG_SIZE;
+
+ /* detected reuse of integer stack slot with a pointer
+ * which means either llvm is reusing stack slot or
+ * an attacker is trying to exploit CVE-2018-3639
+ * (speculative store bypass)
+ * Have to sanitize that slot with preemptive
+ * store of zero.
+ */
+ if (*poff && *poff != soff) {
+ /* disallow programs where single insn stores
+ * into two different stack slots, since verifier
+ * cannot sanitize them
+ */
+ verbose("insn %d cannot access two stack slots fp%d and fp%d",
+ insn_idx, *poff, soff);
+ return -EINVAL;
+ }
+ *poff = soff;
+ }
state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_SPILL;
+ }
} else {
/* regular write of data into stack */
- state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] =
- (struct reg_state) {};
+ state->spilled_regs[spi] = (struct reg_state) {};
for (i = 0; i < size; i++)
state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_MISC;
@@ -663,13 +707,20 @@ static bool is_pointer_value(struct verifier_env *env, int regno)
}
}
+static bool is_ctx_reg(struct verifier_env *env, int regno)
+{
+ const struct reg_state *reg = &env->cur_state.regs[regno];
+
+ return reg->type == PTR_TO_CTX;
+}
+
/* check whether memory at (regno + off) is accessible for t = (read | write)
* if t==write, value_regno is a register which value is stored into memory
* if t==read, value_regno is a register which will receive the value from memory
* if t==write && value_regno==-1, some unknown value is stored into memory
* if t==read && value_regno==-1, don't care what we read from memory
*/
-static int check_mem_access(struct verifier_env *env, u32 regno, int off,
+static int check_mem_access(struct verifier_env *env, int insn_idx, u32 regno, int off,
int bpf_size, enum bpf_access_type t,
int value_regno)
{
@@ -721,7 +772,8 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
verbose("attempt to corrupt spilled pointer on stack\n");
return -EACCES;
}
- err = check_stack_write(state, off, size, value_regno);
+ err = check_stack_write(env, state, off, size,
+ value_regno, insn_idx);
} else {
err = check_stack_read(state, off, size, value_regno);
}
@@ -733,7 +785,7 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
return err;
}
-static int check_xadd(struct verifier_env *env, struct bpf_insn *insn)
+static int check_xadd(struct verifier_env *env, int insn_idx, struct bpf_insn *insn)
{
struct reg_state *regs = env->cur_state.regs;
int err;
@@ -754,14 +806,25 @@ static int check_xadd(struct verifier_env *env, struct bpf_insn *insn)
if (err)
return err;
+ if (is_pointer_value(env, insn->src_reg)) {
+ verbose("R%d leaks addr into mem\n", insn->src_reg);
+ return -EACCES;
+ }
+
+ if (is_ctx_reg(env, insn->dst_reg)) {
+ verbose("BPF_XADD stores into R%d context is not allowed\n",
+ insn->dst_reg);
+ return -EACCES;
+ }
+
/* check whether atomic_add can read the memory */
- err = check_mem_access(env, insn->dst_reg, insn->off,
+ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
BPF_SIZE(insn->code), BPF_READ, -1);
if (err)
return err;
/* check whether atomic_add can write into the same memory */
- return check_mem_access(env, insn->dst_reg, insn->off,
+ return check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
BPF_SIZE(insn->code), BPF_WRITE, -1);
}
@@ -929,7 +992,7 @@ error:
return -EINVAL;
}
-static int check_call(struct verifier_env *env, int func_id)
+static int check_call(struct verifier_env *env, int func_id, int insn_idx)
{
struct verifier_state *state = &env->cur_state;
const struct bpf_func_proto *fn = NULL;
@@ -965,6 +1028,13 @@ static int check_call(struct verifier_env *env, int func_id)
err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &map);
if (err)
return err;
+ if (func_id == BPF_FUNC_tail_call) {
+ if (map == NULL) {
+ verbose("verifier bug\n");
+ return -EINVAL;
+ }
+ env->insn_aux_data[insn_idx].map_ptr = map;
+ }
err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &map);
if (err)
return err;
@@ -1028,7 +1098,8 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn)
}
} else {
if (insn->src_reg != BPF_REG_0 || insn->off != 0 ||
- (insn->imm != 16 && insn->imm != 32 && insn->imm != 64)) {
+ (insn->imm != 16 && insn->imm != 32 && insn->imm != 64) ||
+ BPF_CLASS(insn->code) == BPF_ALU64) {
verbose("BPF_END uses reserved fields\n");
return -EINVAL;
}
@@ -1089,7 +1160,8 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn)
regs[insn->dst_reg].type = UNKNOWN_VALUE;
regs[insn->dst_reg].map_ptr = NULL;
}
- } else {
+ } else if (BPF_CLASS(insn->code) == BPF_ALU64 ||
+ insn->imm >= 0) {
/* case: R = imm
* remember the value we stored into this reg
*/
@@ -1132,6 +1204,11 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn)
return -EINVAL;
}
+ if (opcode == BPF_ARSH && BPF_CLASS(insn->code) != BPF_ALU64) {
+ verbose("BPF_ARSH not supported for 32 bit ALU\n");
+ return -EINVAL;
+ }
+
if ((opcode == BPF_LSH || opcode == BPF_RSH ||
opcode == BPF_ARSH) && BPF_SRC(insn->code) == BPF_K) {
int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32;
@@ -1758,16 +1835,17 @@ static int do_check(struct verifier_env *env)
if (log_level) {
verbose("%d: ", insn_idx);
- print_bpf_insn(insn);
+ print_bpf_insn(env, insn);
}
+ env->insn_aux_data[insn_idx].seen = true;
if (class == BPF_ALU || class == BPF_ALU64) {
err = check_alu_op(env, insn);
if (err)
return err;
} else if (class == BPF_LDX) {
- enum bpf_reg_type src_reg_type;
+ enum bpf_reg_type *prev_src_type, src_reg_type;
/* check for reserved fields is already done */
@@ -1785,27 +1863,30 @@ static int do_check(struct verifier_env *env)
/* check that memory (src_reg + off) is readable,
* the state of dst_reg will be updated by this func
*/
- err = check_mem_access(env, insn->src_reg, insn->off,
+ err = check_mem_access(env, insn_idx, insn->src_reg, insn->off,
BPF_SIZE(insn->code), BPF_READ,
insn->dst_reg);
if (err)
return err;
- if (BPF_SIZE(insn->code) != BPF_W) {
+ if (BPF_SIZE(insn->code) != BPF_W &&
+ BPF_SIZE(insn->code) != BPF_DW) {
insn_idx++;
continue;
}
- if (insn->imm == 0) {
+ prev_src_type = &env->insn_aux_data[insn_idx].ptr_type;
+
+ if (*prev_src_type == NOT_INIT) {
/* saw a valid insn
* dst_reg = *(u32 *)(src_reg + off)
- * use reserved 'imm' field to mark this insn
+ * save type to validate intersecting paths
*/
- insn->imm = src_reg_type;
+ *prev_src_type = src_reg_type;
- } else if (src_reg_type != insn->imm &&
+ } else if (src_reg_type != *prev_src_type &&
(src_reg_type == PTR_TO_CTX ||
- insn->imm == PTR_TO_CTX)) {
+ *prev_src_type == PTR_TO_CTX)) {
/* ABuser program is trying to use the same insn
* dst_reg = *(u32*) (src_reg + off)
* with different pointer types:
@@ -1818,10 +1899,10 @@ static int do_check(struct verifier_env *env)
}
} else if (class == BPF_STX) {
- enum bpf_reg_type dst_reg_type;
+ enum bpf_reg_type *prev_dst_type, dst_reg_type;
if (BPF_MODE(insn->code) == BPF_XADD) {
- err = check_xadd(env, insn);
+ err = check_xadd(env, insn_idx, insn);
if (err)
return err;
insn_idx++;
@@ -1840,17 +1921,19 @@ static int do_check(struct verifier_env *env)
dst_reg_type = regs[insn->dst_reg].type;
/* check that memory (dst_reg + off) is writeable */
- err = check_mem_access(env, insn->dst_reg, insn->off,
+ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
BPF_SIZE(insn->code), BPF_WRITE,
insn->src_reg);
if (err)
return err;
- if (insn->imm == 0) {
- insn->imm = dst_reg_type;
- } else if (dst_reg_type != insn->imm &&
+ prev_dst_type = &env->insn_aux_data[insn_idx].ptr_type;
+
+ if (*prev_dst_type == NOT_INIT) {
+ *prev_dst_type = dst_reg_type;
+ } else if (dst_reg_type != *prev_dst_type &&
(dst_reg_type == PTR_TO_CTX ||
- insn->imm == PTR_TO_CTX)) {
+ *prev_dst_type == PTR_TO_CTX)) {
verbose("same insn cannot be used with different pointers\n");
return -EINVAL;
}
@@ -1866,8 +1949,14 @@ static int do_check(struct verifier_env *env)
if (err)
return err;
+ if (is_ctx_reg(env, insn->dst_reg)) {
+ verbose("BPF_ST stores into R%d context is not allowed\n",
+ insn->dst_reg);
+ return -EACCES;
+ }
+
/* check that memory (dst_reg + off) is writeable */
- err = check_mem_access(env, insn->dst_reg, insn->off,
+ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
BPF_SIZE(insn->code), BPF_WRITE,
-1);
if (err)
@@ -1885,7 +1974,7 @@ static int do_check(struct verifier_env *env)
return -EINVAL;
}
- err = check_call(env, insn->imm);
+ err = check_call(env, insn->imm, insn_idx);
if (err)
return err;
@@ -1952,6 +2041,7 @@ process_bpf_exit:
return err;
insn_idx++;
+ env->insn_aux_data[insn_idx].seen = true;
} else {
verbose("invalid BPF_LD mode\n");
return -EINVAL;
@@ -2037,7 +2127,7 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env)
/* hold the map. If the program is rejected by verifier,
* the map will be released by release_maps() or it
* will be used by the valid program until it's unloaded
- * and all maps are released in free_bpf_prog_info()
+ * and all maps are released in free_used_maps()
*/
map = bpf_map_inc(map, false);
if (IS_ERR(map)) {
@@ -2081,23 +2171,60 @@ static void convert_pseudo_ld_imm64(struct verifier_env *env)
insn->src_reg = 0;
}
-static void adjust_branches(struct bpf_prog *prog, int pos, int delta)
+/* single env->prog->insni[off] instruction was replaced with the range
+ * insni[off, off + cnt). Adjust corresponding insn_aux_data by copying
+ * [0, off) and [off, end) to new locations, so the patched range stays zero
+ */
+static int adjust_insn_aux_data(struct verifier_env *env, u32 prog_len,
+ u32 off, u32 cnt)
{
- struct bpf_insn *insn = prog->insnsi;
- int insn_cnt = prog->len;
+ struct bpf_insn_aux_data *new_data, *old_data = env->insn_aux_data;
int i;
- for (i = 0; i < insn_cnt; i++, insn++) {
- if (BPF_CLASS(insn->code) != BPF_JMP ||
- BPF_OP(insn->code) == BPF_CALL ||
- BPF_OP(insn->code) == BPF_EXIT)
- continue;
+ if (cnt == 1)
+ return 0;
+ new_data = vzalloc(sizeof(struct bpf_insn_aux_data) * prog_len);
+ if (!new_data)
+ return -ENOMEM;
+ memcpy(new_data, old_data, sizeof(struct bpf_insn_aux_data) * off);
+ memcpy(new_data + off + cnt - 1, old_data + off,
+ sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1));
+ for (i = off; i < off + cnt - 1; i++)
+ new_data[i].seen = true;
+ env->insn_aux_data = new_data;
+ vfree(old_data);
+ return 0;
+}
+
+static struct bpf_prog *bpf_patch_insn_data(struct verifier_env *env, u32 off,
+ const struct bpf_insn *patch, u32 len)
+{
+ struct bpf_prog *new_prog;
+
+ new_prog = bpf_patch_insn_single(env->prog, off, patch, len);
+ if (!new_prog)
+ return NULL;
+ if (adjust_insn_aux_data(env, new_prog->len, off, len))
+ return NULL;
+ return new_prog;
+}
+
+/* The verifier does more data flow analysis than llvm and will not explore
+ * branches that are dead at run time. Malicious programs can have dead code
+ * too. Therefore replace all dead at-run-time code with nops.
+ */
+static void sanitize_dead_code(struct verifier_env *env)
+{
+ struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
+ struct bpf_insn nop = BPF_MOV64_REG(BPF_REG_0, BPF_REG_0);
+ struct bpf_insn *insn = env->prog->insnsi;
+ const int insn_cnt = env->prog->len;
+ int i;
- /* adjust offset of jmps if necessary */
- if (i < pos && i + insn->off + 1 > pos)
- insn->off += delta;
- else if (i > pos + delta && i + insn->off + 1 <= pos + delta)
- insn->off -= delta;
+ for (i = 0; i < insn_cnt; i++) {
+ if (aux_data[i].seen)
+ continue;
+ memcpy(insn + i, &nop, sizeof(nop));
}
}
@@ -2107,30 +2234,58 @@ static void adjust_branches(struct bpf_prog *prog, int pos, int delta)
static int convert_ctx_accesses(struct verifier_env *env)
{
struct bpf_insn *insn = env->prog->insnsi;
- int insn_cnt = env->prog->len;
+ const int insn_cnt = env->prog->len;
struct bpf_insn insn_buf[16];
struct bpf_prog *new_prog;
- u32 cnt;
- int i;
enum bpf_access_type type;
+ int i, delta = 0;
if (!env->prog->aux->ops->convert_ctx_access)
return 0;
for (i = 0; i < insn_cnt; i++, insn++) {
- if (insn->code == (BPF_LDX | BPF_MEM | BPF_W))
+ u32 cnt;
+
+ if (insn->code == (BPF_LDX | BPF_MEM | BPF_W) ||
+ insn->code == (BPF_LDX | BPF_MEM | BPF_DW))
type = BPF_READ;
- else if (insn->code == (BPF_STX | BPF_MEM | BPF_W))
+ else if (insn->code == (BPF_STX | BPF_MEM | BPF_W) ||
+ insn->code == (BPF_STX | BPF_MEM | BPF_DW))
type = BPF_WRITE;
else
continue;
- if (insn->imm != PTR_TO_CTX) {
- /* clear internal mark */
- insn->imm = 0;
+ if (type == BPF_WRITE &&
+ env->insn_aux_data[i + delta].sanitize_stack_off) {
+ struct bpf_insn patch[] = {
+ /* Sanitize suspicious stack slot with zero.
+ * There are no memory dependencies for this store,
+ * since it's only using frame pointer and immediate
+ * constant of zero
+ */
+ BPF_ST_MEM(BPF_DW, BPF_REG_FP,
+ env->insn_aux_data[i + delta].sanitize_stack_off,
+ 0),
+ /* the original STX instruction will immediately
+ * overwrite the same stack slot with appropriate value
+ */
+ *insn,
+ };
+
+ cnt = ARRAY_SIZE(patch);
+ new_prog = bpf_patch_insn_data(env, i + delta, patch, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
continue;
}
+ if (env->insn_aux_data[i + delta].ptr_type != PTR_TO_CTX)
+ continue;
+
cnt = env->prog->aux->ops->
convert_ctx_access(type, insn->dst_reg, insn->src_reg,
insn->off, insn_buf, env->prog);
@@ -2139,34 +2294,107 @@ static int convert_ctx_accesses(struct verifier_env *env)
return -EINVAL;
}
- if (cnt == 1) {
- memcpy(insn, insn_buf, sizeof(*insn));
- continue;
- }
-
- /* several new insns need to be inserted. Make room for them */
- insn_cnt += cnt - 1;
- new_prog = bpf_prog_realloc(env->prog,
- bpf_prog_size(insn_cnt),
- GFP_USER);
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
if (!new_prog)
return -ENOMEM;
- new_prog->len = insn_cnt;
+ delta += cnt - 1;
- memmove(new_prog->insnsi + i + cnt, new_prog->insns + i + 1,
- sizeof(*insn) * (insn_cnt - i - cnt));
+ /* keep walking new program and skip insns we just inserted */
+ env->prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ }
- /* copy substitute insns in place of load instruction */
- memcpy(new_prog->insnsi + i, insn_buf, sizeof(*insn) * cnt);
+ return 0;
+}
- /* adjust branches in the whole program */
- adjust_branches(new_prog, i, cnt - 1);
+/* fixup insn->imm field of bpf_call instructions
+ *
+ * this function is called after eBPF program passed verification
+ */
+static int fixup_bpf_calls(struct verifier_env *env)
+{
+ struct bpf_prog *prog = env->prog;
+ struct bpf_insn *insn = prog->insnsi;
+ const struct bpf_func_proto *fn;
+ const int insn_cnt = prog->len;
+ struct bpf_insn insn_buf[16];
+ struct bpf_prog *new_prog;
+ struct bpf_map *map_ptr;
+ int i, cnt, delta = 0;
- /* keep walking new program and skip insns we just inserted */
- env->prog = new_prog;
- insn = new_prog->insnsi + i + cnt - 1;
- i += cnt - 1;
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ if (insn->code == (BPF_ALU | BPF_MOD | BPF_X) ||
+ insn->code == (BPF_ALU | BPF_DIV | BPF_X)) {
+ /* due to JIT bugs clear upper 32-bits of src register
+ * before div/mod operation
+ */
+ insn_buf[0] = BPF_MOV32_REG(insn->src_reg, insn->src_reg);
+ insn_buf[1] = *insn;
+ cnt = 2;
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ continue;
+ }
+
+ if (insn->code != (BPF_JMP | BPF_CALL))
+ continue;
+
+ if (insn->imm == BPF_FUNC_get_route_realm)
+ prog->dst_needed = 1;
+ if (insn->imm == BPF_FUNC_get_prandom_u32)
+ bpf_user_rnd_init_once();
+ if (insn->imm == BPF_FUNC_tail_call) {
+ /* mark bpf_tail_call as different opcode to avoid
+ * conditional branch in the interpeter for every normal
+ * call and to prevent accidental JITing by JIT compiler
+ * that doesn't support bpf_tail_call yet
+ */
+ insn->imm = 0;
+ insn->code |= BPF_X;
+
+ /* instead of changing every JIT dealing with tail_call
+ * emit two extra insns:
+ * if (index >= max_entries) goto out;
+ * index &= array->index_mask;
+ * to avoid out-of-bounds cpu speculation
+ */
+ map_ptr = env->insn_aux_data[i + delta].map_ptr;
+ if (!map_ptr->unpriv_array)
+ continue;
+ insn_buf[0] = BPF_JMP_IMM(BPF_JGE, BPF_REG_3,
+ map_ptr->max_entries, 2);
+ insn_buf[1] = BPF_ALU32_IMM(BPF_AND, BPF_REG_3,
+ container_of(map_ptr,
+ struct bpf_array,
+ map)->index_mask);
+ insn_buf[2] = *insn;
+ cnt = 3;
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ continue;
+ }
+
+ fn = prog->aux->ops->get_func_proto(insn->imm);
+ /* all functions that have prototype and verifier allowed
+ * programs to call them, must be real in-kernel functions
+ */
+ if (!fn->func) {
+ verbose("kernel subsystem misconfigured func %d\n",
+ insn->imm);
+ return -EFAULT;
+ }
+ insn->imm = fn->func - __bpf_call_base;
}
return 0;
@@ -2210,6 +2438,11 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
if (!env)
return -ENOMEM;
+ env->insn_aux_data = vzalloc(sizeof(struct bpf_insn_aux_data) *
+ (*prog)->len);
+ ret = -ENOMEM;
+ if (!env->insn_aux_data)
+ goto err_free_env;
env->prog = *prog;
/* grab the mutex to protect few globals used by verifier */
@@ -2228,12 +2461,12 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
/* log_* values have to be sane */
if (log_size < 128 || log_size > UINT_MAX >> 8 ||
log_level == 0 || log_ubuf == NULL)
- goto free_env;
+ goto err_unlock;
ret = -ENOMEM;
log_buf = vmalloc(log_size);
if (!log_buf)
- goto free_env;
+ goto err_unlock;
} else {
log_level = 0;
}
@@ -2262,9 +2495,15 @@ skip_full_check:
free_states(env);
if (ret == 0)
+ sanitize_dead_code(env);
+
+ if (ret == 0)
/* program is valid, convert *(u32*)(ctx + off) accesses */
ret = convert_ctx_accesses(env);
+ if (ret == 0)
+ ret = fixup_bpf_calls(env);
+
if (log_level && log_len >= log_size - 1) {
BUG_ON(log_len >= log_size);
/* verifier log exceeded user supplied buffer */
@@ -2302,14 +2541,16 @@ skip_full_check:
free_log_buf:
if (log_level)
vfree(log_buf);
-free_env:
if (!env->prog->aux->used_maps)
/* if we didn't copy map pointers into bpf_prog_info, release
- * them now. Otherwise free_bpf_prog_info() will release them.
+ * them now. Otherwise free_used_maps() will release them.
*/
release_maps(env);
*prog = env->prog;
- kfree(env);
+err_unlock:
mutex_unlock(&bpf_verifier_lock);
+ vfree(env->insn_aux_data);
+err_free_env:
+ kfree(env);
return ret;
}