summaryrefslogtreecommitdiff
path: root/kernel/bpf
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/bpf')
-rw-r--r--kernel/bpf/Makefile5
-rw-r--r--kernel/bpf/arraymap.c373
-rw-r--r--kernel/bpf/core.c892
-rw-r--r--kernel/bpf/hashtab.c388
-rw-r--r--kernel/bpf/helpers.c179
-rw-r--r--kernel/bpf/inode.c387
-rw-r--r--kernel/bpf/syscall.c743
-rw-r--r--kernel/bpf/verifier.c2556
8 files changed, 5523 insertions, 0 deletions
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
new file mode 100644
index 000000000000..677991f29d66
--- /dev/null
+++ b/kernel/bpf/Makefile
@@ -0,0 +1,5 @@
+obj-y := core.o
+CFLAGS_core.o += $(call cc-disable-warning, override-init)
+
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o
+obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
new file mode 100644
index 000000000000..daa4e0782cf7
--- /dev/null
+++ b/kernel/bpf/arraymap.c
@@ -0,0 +1,373 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/bpf.h>
+#include <linux/err.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/filter.h>
+#include <linux/perf_event.h>
+
+/* Called from syscall */
+static struct bpf_map *array_map_alloc(union bpf_attr *attr)
+{
+ u32 elem_size, array_size, index_mask, max_entries;
+ bool unpriv = !capable(CAP_SYS_ADMIN);
+ struct bpf_array *array;
+ u64 mask64;
+
+ /* check sanity of attributes */
+ if (attr->max_entries == 0 || attr->key_size != 4 ||
+ attr->value_size == 0)
+ return ERR_PTR(-EINVAL);
+
+ if (attr->value_size >= 1 << (KMALLOC_SHIFT_MAX - 1))
+ /* if value_size is bigger, the user space won't be able to
+ * access the elements.
+ */
+ return ERR_PTR(-E2BIG);
+
+ elem_size = round_up(attr->value_size, 8);
+
+ max_entries = attr->max_entries;
+
+ /* On 32 bit archs roundup_pow_of_two() with max_entries that has
+ * upper most bit set in u32 space is undefined behavior due to
+ * resulting 1U << 32, so do it manually here in u64 space.
+ */
+ mask64 = fls_long(max_entries - 1);
+ mask64 = 1ULL << mask64;
+ mask64 -= 1;
+
+ index_mask = mask64;
+ if (unpriv) {
+ /* round up array size to nearest power of 2,
+ * since cpu will speculate within index_mask limits
+ */
+ max_entries = index_mask + 1;
+ /* Check for overflows. */
+ if (max_entries < attr->max_entries)
+ return ERR_PTR(-E2BIG);
+ }
+
+ /* check round_up into zero and u32 overflow */
+ if (elem_size == 0 ||
+ max_entries > (U32_MAX - PAGE_SIZE - sizeof(*array)) / elem_size)
+ return ERR_PTR(-ENOMEM);
+
+ array_size = sizeof(*array) + max_entries * elem_size;
+
+ /* allocate all map elements and zero-initialize them */
+ array = kzalloc(array_size, GFP_USER | __GFP_NOWARN);
+ if (!array) {
+ array = vzalloc(array_size);
+ if (!array)
+ return ERR_PTR(-ENOMEM);
+ }
+ array->index_mask = index_mask;
+ array->map.unpriv_array = unpriv;
+
+ /* copy mandatory map attributes */
+ array->map.key_size = attr->key_size;
+ array->map.value_size = attr->value_size;
+ array->map.max_entries = attr->max_entries;
+ array->map.pages = round_up(array_size, PAGE_SIZE) >> PAGE_SHIFT;
+ array->elem_size = elem_size;
+
+ return &array->map;
+}
+
+/* Called from syscall or from eBPF program */
+static void *array_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ u32 index = *(u32 *)key;
+
+ if (index >= array->map.max_entries)
+ return NULL;
+
+ return array->value + array->elem_size * (index & array->index_mask);
+}
+
+/* Called from syscall */
+static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ u32 index = key ? *(u32 *)key : U32_MAX;
+ u32 *next = (u32 *)next_key;
+
+ if (index >= array->map.max_entries) {
+ *next = 0;
+ return 0;
+ }
+
+ if (index == array->map.max_entries - 1)
+ return -ENOENT;
+
+ *next = index + 1;
+ return 0;
+}
+
+/* Called from syscall or from eBPF program */
+static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ u32 index = *(u32 *)key;
+
+ if (map_flags > BPF_EXIST)
+ /* unknown flags */
+ return -EINVAL;
+
+ if (index >= array->map.max_entries)
+ /* all elements were pre-allocated, cannot insert a new one */
+ return -E2BIG;
+
+ if (map_flags == BPF_NOEXIST)
+ /* all elements already exist */
+ return -EEXIST;
+
+ memcpy(array->value +
+ array->elem_size * (index & array->index_mask),
+ value, map->value_size);
+ return 0;
+}
+
+/* Called from syscall or from eBPF program */
+static int array_map_delete_elem(struct bpf_map *map, void *key)
+{
+ return -EINVAL;
+}
+
+/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
+static void array_map_free(struct bpf_map *map)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+
+ /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
+ * so the programs (can be more than one that used this map) were
+ * disconnected from events. Wait for outstanding programs to complete
+ * and free the array
+ */
+ synchronize_rcu();
+
+ kvfree(array);
+}
+
+static const struct bpf_map_ops array_ops = {
+ .map_alloc = array_map_alloc,
+ .map_free = array_map_free,
+ .map_get_next_key = array_map_get_next_key,
+ .map_lookup_elem = array_map_lookup_elem,
+ .map_update_elem = array_map_update_elem,
+ .map_delete_elem = array_map_delete_elem,
+};
+
+static struct bpf_map_type_list array_type __read_mostly = {
+ .ops = &array_ops,
+ .type = BPF_MAP_TYPE_ARRAY,
+};
+
+static int __init register_array_map(void)
+{
+ bpf_register_map_type(&array_type);
+ return 0;
+}
+late_initcall(register_array_map);
+
+static struct bpf_map *fd_array_map_alloc(union bpf_attr *attr)
+{
+ /* only file descriptors can be stored in this type of map */
+ if (attr->value_size != sizeof(u32))
+ return ERR_PTR(-EINVAL);
+ return array_map_alloc(attr);
+}
+
+static void fd_array_map_free(struct bpf_map *map)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ int i;
+
+ synchronize_rcu();
+
+ /* make sure it's empty */
+ for (i = 0; i < array->map.max_entries; i++)
+ BUG_ON(array->ptrs[i] != NULL);
+ kvfree(array);
+}
+
+static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ return NULL;
+}
+
+/* only called from syscall */
+static int fd_array_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ void *new_ptr, *old_ptr;
+ u32 index = *(u32 *)key, ufd;
+
+ if (map_flags != BPF_ANY)
+ return -EINVAL;
+
+ if (index >= array->map.max_entries)
+ return -E2BIG;
+
+ ufd = *(u32 *)value;
+ new_ptr = map->ops->map_fd_get_ptr(map, ufd);
+ if (IS_ERR(new_ptr))
+ return PTR_ERR(new_ptr);
+
+ old_ptr = xchg(array->ptrs + index, new_ptr);
+ if (old_ptr)
+ map->ops->map_fd_put_ptr(old_ptr);
+
+ return 0;
+}
+
+static int fd_array_map_delete_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ void *old_ptr;
+ u32 index = *(u32 *)key;
+
+ if (index >= array->map.max_entries)
+ return -E2BIG;
+
+ old_ptr = xchg(array->ptrs + index, NULL);
+ if (old_ptr) {
+ map->ops->map_fd_put_ptr(old_ptr);
+ return 0;
+ } else {
+ return -ENOENT;
+ }
+}
+
+static void *prog_fd_array_get_ptr(struct bpf_map *map, int fd)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ struct bpf_prog *prog = bpf_prog_get(fd);
+ if (IS_ERR(prog))
+ return prog;
+
+ if (!bpf_prog_array_compatible(array, prog)) {
+ bpf_prog_put(prog);
+ return ERR_PTR(-EINVAL);
+ }
+ return prog;
+}
+
+static void prog_fd_array_put_ptr(void *ptr)
+{
+ bpf_prog_put(ptr);
+}
+
+/* decrement refcnt of all bpf_progs that are stored in this map */
+void bpf_fd_array_map_clear(struct bpf_map *map)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ int i;
+
+ for (i = 0; i < array->map.max_entries; i++)
+ fd_array_map_delete_elem(map, &i);
+}
+
+static const struct bpf_map_ops prog_array_ops = {
+ .map_alloc = fd_array_map_alloc,
+ .map_free = fd_array_map_free,
+ .map_get_next_key = array_map_get_next_key,
+ .map_lookup_elem = fd_array_map_lookup_elem,
+ .map_update_elem = fd_array_map_update_elem,
+ .map_delete_elem = fd_array_map_delete_elem,
+ .map_fd_get_ptr = prog_fd_array_get_ptr,
+ .map_fd_put_ptr = prog_fd_array_put_ptr,
+};
+
+static struct bpf_map_type_list prog_array_type __read_mostly = {
+ .ops = &prog_array_ops,
+ .type = BPF_MAP_TYPE_PROG_ARRAY,
+};
+
+static int __init register_prog_array_map(void)
+{
+ bpf_register_map_type(&prog_array_type);
+ return 0;
+}
+late_initcall(register_prog_array_map);
+
+static void perf_event_array_map_free(struct bpf_map *map)
+{
+ bpf_fd_array_map_clear(map);
+ fd_array_map_free(map);
+}
+
+static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd)
+{
+ struct perf_event *event;
+ const struct perf_event_attr *attr;
+
+ event = perf_event_get(fd);
+ if (IS_ERR(event))
+ return event;
+
+ attr = perf_event_attrs(event);
+ if (IS_ERR(attr))
+ goto err;
+
+ if (attr->inherit)
+ goto err;
+
+ if (attr->type == PERF_TYPE_RAW)
+ return event;
+
+ if (attr->type == PERF_TYPE_HARDWARE)
+ return event;
+
+ if (attr->type == PERF_TYPE_SOFTWARE &&
+ attr->config == PERF_COUNT_SW_BPF_OUTPUT)
+ return event;
+err:
+ perf_event_release_kernel(event);
+ return ERR_PTR(-EINVAL);
+}
+
+static void perf_event_fd_array_put_ptr(void *ptr)
+{
+ struct perf_event *event = ptr;
+
+ perf_event_release_kernel(event);
+}
+
+static const struct bpf_map_ops perf_event_array_ops = {
+ .map_alloc = fd_array_map_alloc,
+ .map_free = perf_event_array_map_free,
+ .map_get_next_key = array_map_get_next_key,
+ .map_lookup_elem = fd_array_map_lookup_elem,
+ .map_update_elem = fd_array_map_update_elem,
+ .map_delete_elem = fd_array_map_delete_elem,
+ .map_fd_get_ptr = perf_event_fd_array_get_ptr,
+ .map_fd_put_ptr = perf_event_fd_array_put_ptr,
+};
+
+static struct bpf_map_type_list perf_event_array_type __read_mostly = {
+ .ops = &perf_event_array_ops,
+ .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
+};
+
+static int __init register_perf_event_array_map(void)
+{
+ bpf_register_map_type(&perf_event_array_type);
+ return 0;
+}
+late_initcall(register_perf_event_array_map);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
new file mode 100644
index 000000000000..eb52d11fdaa7
--- /dev/null
+++ b/kernel/bpf/core.c
@@ -0,0 +1,892 @@
+/*
+ * Linux Socket Filter - Kernel level socket filtering
+ *
+ * Based on the design of the Berkeley Packet Filter. The new
+ * internal format has been designed by PLUMgrid:
+ *
+ * Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com
+ *
+ * Authors:
+ *
+ * Jay Schulist <jschlst@samba.org>
+ * Alexei Starovoitov <ast@plumgrid.com>
+ * Daniel Borkmann <dborkman@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Andi Kleen - Fix a few bad bugs and races.
+ * Kris Katterjohn - Added many additional checks in bpf_check_classic()
+ */
+
+#include <linux/filter.h>
+#include <linux/skbuff.h>
+#include <linux/vmalloc.h>
+#include <linux/random.h>
+#include <linux/moduleloader.h>
+#include <linux/bpf.h>
+
+#include <asm/unaligned.h>
+
+/* Registers */
+#define BPF_R0 regs[BPF_REG_0]
+#define BPF_R1 regs[BPF_REG_1]
+#define BPF_R2 regs[BPF_REG_2]
+#define BPF_R3 regs[BPF_REG_3]
+#define BPF_R4 regs[BPF_REG_4]
+#define BPF_R5 regs[BPF_REG_5]
+#define BPF_R6 regs[BPF_REG_6]
+#define BPF_R7 regs[BPF_REG_7]
+#define BPF_R8 regs[BPF_REG_8]
+#define BPF_R9 regs[BPF_REG_9]
+#define BPF_R10 regs[BPF_REG_10]
+
+/* Named registers */
+#define DST regs[insn->dst_reg]
+#define SRC regs[insn->src_reg]
+#define FP regs[BPF_REG_FP]
+#define ARG1 regs[BPF_REG_ARG1]
+#define CTX regs[BPF_REG_CTX]
+#define IMM insn->imm
+
+/* No hurry in this branch
+ *
+ * Exported for the bpf jit load helper.
+ */
+void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, unsigned int size)
+{
+ u8 *ptr = NULL;
+
+ if (k >= SKF_NET_OFF)
+ ptr = skb_network_header(skb) + k - SKF_NET_OFF;
+ else if (k >= SKF_LL_OFF)
+ ptr = skb_mac_header(skb) + k - SKF_LL_OFF;
+
+ if (ptr >= skb->head && ptr + size <= skb_tail_pointer(skb))
+ return ptr;
+
+ return NULL;
+}
+
+struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
+{
+ gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO |
+ gfp_extra_flags;
+ struct bpf_prog_aux *aux;
+ struct bpf_prog *fp;
+
+ size = round_up(size, PAGE_SIZE);
+ fp = __vmalloc(size, gfp_flags, PAGE_KERNEL);
+ if (fp == NULL)
+ return NULL;
+
+ kmemcheck_annotate_bitfield(fp, meta);
+
+ aux = kzalloc(sizeof(*aux), GFP_KERNEL | gfp_extra_flags);
+ if (aux == NULL) {
+ vfree(fp);
+ return NULL;
+ }
+
+ fp->pages = size / PAGE_SIZE;
+ fp->aux = aux;
+ fp->aux->prog = fp;
+
+ return fp;
+}
+EXPORT_SYMBOL_GPL(bpf_prog_alloc);
+
+struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
+ gfp_t gfp_extra_flags)
+{
+ gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO |
+ gfp_extra_flags;
+ struct bpf_prog *fp;
+
+ BUG_ON(fp_old == NULL);
+
+ size = round_up(size, PAGE_SIZE);
+ if (size <= fp_old->pages * PAGE_SIZE)
+ return fp_old;
+
+ fp = __vmalloc(size, gfp_flags, PAGE_KERNEL);
+ if (fp != NULL) {
+ kmemcheck_annotate_bitfield(fp, meta);
+
+ memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE);
+ fp->pages = size / PAGE_SIZE;
+ fp->aux->prog = fp;
+
+ /* We keep fp->aux from fp_old around in the new
+ * reallocated structure.
+ */
+ fp_old->aux = NULL;
+ __bpf_prog_free(fp_old);
+ }
+
+ return fp;
+}
+EXPORT_SYMBOL_GPL(bpf_prog_realloc);
+
+void __bpf_prog_free(struct bpf_prog *fp)
+{
+ kfree(fp->aux);
+ vfree(fp);
+}
+EXPORT_SYMBOL_GPL(__bpf_prog_free);
+
+static bool bpf_is_jmp_and_has_target(const struct bpf_insn *insn)
+{
+ return BPF_CLASS(insn->code) == BPF_JMP &&
+ /* Call and Exit are both special jumps with no
+ * target inside the BPF instruction image.
+ */
+ BPF_OP(insn->code) != BPF_CALL &&
+ BPF_OP(insn->code) != BPF_EXIT;
+}
+
+static void bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta)
+{
+ struct bpf_insn *insn = prog->insnsi;
+ u32 i, insn_cnt = prog->len;
+
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ if (!bpf_is_jmp_and_has_target(insn))
+ continue;
+
+ /* Adjust offset of jmps if we cross boundaries. */
+ if (i < pos && i + insn->off + 1 > pos)
+ insn->off += delta;
+ else if (i > pos + delta && i + insn->off + 1 <= pos + delta)
+ insn->off -= delta;
+ }
+}
+
+struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
+ const struct bpf_insn *patch, u32 len)
+{
+ u32 insn_adj_cnt, insn_rest, insn_delta = len - 1;
+ struct bpf_prog *prog_adj;
+
+ /* Since our patchlet doesn't expand the image, we're done. */
+ if (insn_delta == 0) {
+ memcpy(prog->insnsi + off, patch, sizeof(*patch));
+ return prog;
+ }
+
+ insn_adj_cnt = prog->len + insn_delta;
+
+ /* Several new instructions need to be inserted. Make room
+ * for them. Likely, there's no need for a new allocation as
+ * last page could have large enough tailroom.
+ */
+ prog_adj = bpf_prog_realloc(prog, bpf_prog_size(insn_adj_cnt),
+ GFP_USER);
+ if (!prog_adj)
+ return NULL;
+
+ prog_adj->len = insn_adj_cnt;
+
+ /* Patching happens in 3 steps:
+ *
+ * 1) Move over tail of insnsi from next instruction onwards,
+ * so we can patch the single target insn with one or more
+ * new ones (patching is always from 1 to n insns, n > 0).
+ * 2) Inject new instructions at the target location.
+ * 3) Adjust branch offsets if necessary.
+ */
+ insn_rest = insn_adj_cnt - off - len;
+
+ memmove(prog_adj->insnsi + off + len, prog_adj->insnsi + off + 1,
+ sizeof(*patch) * insn_rest);
+ memcpy(prog_adj->insnsi + off, patch, sizeof(*patch) * len);
+
+ bpf_adj_branches(prog_adj, off, insn_delta);
+
+ return prog_adj;
+}
+
+#ifdef CONFIG_BPF_JIT
+struct bpf_binary_header *
+bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
+ unsigned int alignment,
+ bpf_jit_fill_hole_t bpf_fill_ill_insns)
+{
+ struct bpf_binary_header *hdr;
+ unsigned int size, hole, start;
+
+ /* Most of BPF filters are really small, but if some of them
+ * fill a page, allow at least 128 extra bytes to insert a
+ * random section of illegal instructions.
+ */
+ size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE);
+ hdr = module_alloc(size);
+ if (hdr == NULL)
+ return NULL;
+
+ /* Fill space with illegal/arch-dep instructions. */
+ bpf_fill_ill_insns(hdr, size);
+
+ hdr->pages = size / PAGE_SIZE;
+ hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)),
+ PAGE_SIZE - sizeof(*hdr));
+ start = (prandom_u32() % hole) & ~(alignment - 1);
+
+ /* Leave a random number of instructions before BPF code. */
+ *image_ptr = &hdr->image[start];
+
+ return hdr;
+}
+
+void bpf_jit_binary_free(struct bpf_binary_header *hdr)
+{
+ module_memfree(hdr);
+}
+#endif /* CONFIG_BPF_JIT */
+
+/* Base function for offset calculation. Needs to go into .text section,
+ * therefore keeping it non-static as well; will also be used by JITs
+ * anyway later on, so do not let the compiler omit it.
+ */
+noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ return 0;
+}
+EXPORT_SYMBOL_GPL(__bpf_call_base);
+
+#ifndef CONFIG_BPF_JIT_ALWAYS_ON
+/**
+ * __bpf_prog_run - run eBPF program on a given context
+ * @ctx: is the data we are operating on
+ * @insn: is the array of eBPF instructions
+ *
+ * Decode and execute eBPF instructions.
+ */
+static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
+{
+ u64 stack[MAX_BPF_STACK / sizeof(u64)];
+ u64 regs[MAX_BPF_REG], tmp;
+ static const void *jumptable[256] = {
+ [0 ... 255] = &&default_label,
+ /* Now overwrite non-defaults ... */
+ /* 32 bit ALU operations */
+ [BPF_ALU | BPF_ADD | BPF_X] = &&ALU_ADD_X,
+ [BPF_ALU | BPF_ADD | BPF_K] = &&ALU_ADD_K,
+ [BPF_ALU | BPF_SUB | BPF_X] = &&ALU_SUB_X,
+ [BPF_ALU | BPF_SUB | BPF_K] = &&ALU_SUB_K,
+ [BPF_ALU | BPF_AND | BPF_X] = &&ALU_AND_X,
+ [BPF_ALU | BPF_AND | BPF_K] = &&ALU_AND_K,
+ [BPF_ALU | BPF_OR | BPF_X] = &&ALU_OR_X,
+ [BPF_ALU | BPF_OR | BPF_K] = &&ALU_OR_K,
+ [BPF_ALU | BPF_LSH | BPF_X] = &&ALU_LSH_X,
+ [BPF_ALU | BPF_LSH | BPF_K] = &&ALU_LSH_K,
+ [BPF_ALU | BPF_RSH | BPF_X] = &&ALU_RSH_X,
+ [BPF_ALU | BPF_RSH | BPF_K] = &&ALU_RSH_K,
+ [BPF_ALU | BPF_XOR | BPF_X] = &&ALU_XOR_X,
+ [BPF_ALU | BPF_XOR | BPF_K] = &&ALU_XOR_K,
+ [BPF_ALU | BPF_MUL | BPF_X] = &&ALU_MUL_X,
+ [BPF_ALU | BPF_MUL | BPF_K] = &&ALU_MUL_K,
+ [BPF_ALU | BPF_MOV | BPF_X] = &&ALU_MOV_X,
+ [BPF_ALU | BPF_MOV | BPF_K] = &&ALU_MOV_K,
+ [BPF_ALU | BPF_DIV | BPF_X] = &&ALU_DIV_X,
+ [BPF_ALU | BPF_DIV | BPF_K] = &&ALU_DIV_K,
+ [BPF_ALU | BPF_MOD | BPF_X] = &&ALU_MOD_X,
+ [BPF_ALU | BPF_MOD | BPF_K] = &&ALU_MOD_K,
+ [BPF_ALU | BPF_NEG] = &&ALU_NEG,
+ [BPF_ALU | BPF_END | BPF_TO_BE] = &&ALU_END_TO_BE,
+ [BPF_ALU | BPF_END | BPF_TO_LE] = &&ALU_END_TO_LE,
+ /* 64 bit ALU operations */
+ [BPF_ALU64 | BPF_ADD | BPF_X] = &&ALU64_ADD_X,
+ [BPF_ALU64 | BPF_ADD | BPF_K] = &&ALU64_ADD_K,
+ [BPF_ALU64 | BPF_SUB | BPF_X] = &&ALU64_SUB_X,
+ [BPF_ALU64 | BPF_SUB | BPF_K] = &&ALU64_SUB_K,
+ [BPF_ALU64 | BPF_AND | BPF_X] = &&ALU64_AND_X,
+ [BPF_ALU64 | BPF_AND | BPF_K] = &&ALU64_AND_K,
+ [BPF_ALU64 | BPF_OR | BPF_X] = &&ALU64_OR_X,
+ [BPF_ALU64 | BPF_OR | BPF_K] = &&ALU64_OR_K,
+ [BPF_ALU64 | BPF_LSH | BPF_X] = &&ALU64_LSH_X,
+ [BPF_ALU64 | BPF_LSH | BPF_K] = &&ALU64_LSH_K,
+ [BPF_ALU64 | BPF_RSH | BPF_X] = &&ALU64_RSH_X,
+ [BPF_ALU64 | BPF_RSH | BPF_K] = &&ALU64_RSH_K,
+ [BPF_ALU64 | BPF_XOR | BPF_X] = &&ALU64_XOR_X,
+ [BPF_ALU64 | BPF_XOR | BPF_K] = &&ALU64_XOR_K,
+ [BPF_ALU64 | BPF_MUL | BPF_X] = &&ALU64_MUL_X,
+ [BPF_ALU64 | BPF_MUL | BPF_K] = &&ALU64_MUL_K,
+ [BPF_ALU64 | BPF_MOV | BPF_X] = &&ALU64_MOV_X,
+ [BPF_ALU64 | BPF_MOV | BPF_K] = &&ALU64_MOV_K,
+ [BPF_ALU64 | BPF_ARSH | BPF_X] = &&ALU64_ARSH_X,
+ [BPF_ALU64 | BPF_ARSH | BPF_K] = &&ALU64_ARSH_K,
+ [BPF_ALU64 | BPF_DIV | BPF_X] = &&ALU64_DIV_X,
+ [BPF_ALU64 | BPF_DIV | BPF_K] = &&ALU64_DIV_K,
+ [BPF_ALU64 | BPF_MOD | BPF_X] = &&ALU64_MOD_X,
+ [BPF_ALU64 | BPF_MOD | BPF_K] = &&ALU64_MOD_K,
+ [BPF_ALU64 | BPF_NEG] = &&ALU64_NEG,
+ /* Call instruction */
+ [BPF_JMP | BPF_CALL] = &&JMP_CALL,
+ [BPF_JMP | BPF_CALL | BPF_X] = &&JMP_TAIL_CALL,
+ /* Jumps */
+ [BPF_JMP | BPF_JA] = &&JMP_JA,
+ [BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X,
+ [BPF_JMP | BPF_JEQ | BPF_K] = &&JMP_JEQ_K,
+ [BPF_JMP | BPF_JNE | BPF_X] = &&JMP_JNE_X,
+ [BPF_JMP | BPF_JNE | BPF_K] = &&JMP_JNE_K,
+ [BPF_JMP | BPF_JGT | BPF_X] = &&JMP_JGT_X,
+ [BPF_JMP | BPF_JGT | BPF_K] = &&JMP_JGT_K,
+ [BPF_JMP | BPF_JGE | BPF_X] = &&JMP_JGE_X,
+ [BPF_JMP | BPF_JGE | BPF_K] = &&JMP_JGE_K,
+ [BPF_JMP | BPF_JSGT | BPF_X] = &&JMP_JSGT_X,
+ [BPF_JMP | BPF_JSGT | BPF_K] = &&JMP_JSGT_K,
+ [BPF_JMP | BPF_JSGE | BPF_X] = &&JMP_JSGE_X,
+ [BPF_JMP | BPF_JSGE | BPF_K] = &&JMP_JSGE_K,
+ [BPF_JMP | BPF_JSET | BPF_X] = &&JMP_JSET_X,
+ [BPF_JMP | BPF_JSET | BPF_K] = &&JMP_JSET_K,
+ /* Program return */
+ [BPF_JMP | BPF_EXIT] = &&JMP_EXIT,
+ /* Store instructions */
+ [BPF_STX | BPF_MEM | BPF_B] = &&STX_MEM_B,
+ [BPF_STX | BPF_MEM | BPF_H] = &&STX_MEM_H,
+ [BPF_STX | BPF_MEM | BPF_W] = &&STX_MEM_W,
+ [BPF_STX | BPF_MEM | BPF_DW] = &&STX_MEM_DW,
+ [BPF_STX | BPF_XADD | BPF_W] = &&STX_XADD_W,
+ [BPF_STX | BPF_XADD | BPF_DW] = &&STX_XADD_DW,
+ [BPF_ST | BPF_MEM | BPF_B] = &&ST_MEM_B,
+ [BPF_ST | BPF_MEM | BPF_H] = &&ST_MEM_H,
+ [BPF_ST | BPF_MEM | BPF_W] = &&ST_MEM_W,
+ [BPF_ST | BPF_MEM | BPF_DW] = &&ST_MEM_DW,
+ /* Load instructions */
+ [BPF_LDX | BPF_MEM | BPF_B] = &&LDX_MEM_B,
+ [BPF_LDX | BPF_MEM | BPF_H] = &&LDX_MEM_H,
+ [BPF_LDX | BPF_MEM | BPF_W] = &&LDX_MEM_W,
+ [BPF_LDX | BPF_MEM | BPF_DW] = &&LDX_MEM_DW,
+ [BPF_LD | BPF_ABS | BPF_W] = &&LD_ABS_W,
+ [BPF_LD | BPF_ABS | BPF_H] = &&LD_ABS_H,
+ [BPF_LD | BPF_ABS | BPF_B] = &&LD_ABS_B,
+ [BPF_LD | BPF_IND | BPF_W] = &&LD_IND_W,
+ [BPF_LD | BPF_IND | BPF_H] = &&LD_IND_H,
+ [BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B,
+ [BPF_LD | BPF_IMM | BPF_DW] = &&LD_IMM_DW,
+ };
+ u32 tail_call_cnt = 0;
+ void *ptr;
+ int off;
+
+#define CONT ({ insn++; goto select_insn; })
+#define CONT_JMP ({ insn++; goto select_insn; })
+
+ FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)];
+ ARG1 = (u64) (unsigned long) ctx;
+
+ /* Registers used in classic BPF programs need to be reset first. */
+ regs[BPF_REG_A] = 0;
+ regs[BPF_REG_X] = 0;
+
+select_insn:
+ goto *jumptable[insn->code];
+
+ /* ALU */
+#define ALU(OPCODE, OP) \
+ ALU64_##OPCODE##_X: \
+ DST = DST OP SRC; \
+ CONT; \
+ ALU_##OPCODE##_X: \
+ DST = (u32) DST OP (u32) SRC; \
+ CONT; \
+ ALU64_##OPCODE##_K: \
+ DST = DST OP IMM; \
+ CONT; \
+ ALU_##OPCODE##_K: \
+ DST = (u32) DST OP (u32) IMM; \
+ CONT;
+
+ ALU(ADD, +)
+ ALU(SUB, -)
+ ALU(AND, &)
+ ALU(OR, |)
+ ALU(LSH, <<)
+ ALU(RSH, >>)
+ ALU(XOR, ^)
+ ALU(MUL, *)
+#undef ALU
+ ALU_NEG:
+ DST = (u32) -DST;
+ CONT;
+ ALU64_NEG:
+ DST = -DST;
+ CONT;
+ ALU_MOV_X:
+ DST = (u32) SRC;
+ CONT;
+ ALU_MOV_K:
+ DST = (u32) IMM;
+ CONT;
+ ALU64_MOV_X:
+ DST = SRC;
+ CONT;
+ ALU64_MOV_K:
+ DST = IMM;
+ CONT;
+ LD_IMM_DW:
+ DST = (u64) (u32) insn[0].imm | ((u64) (u32) insn[1].imm) << 32;
+ insn++;
+ CONT;
+ ALU64_ARSH_X:
+ (*(s64 *) &DST) >>= SRC;
+ CONT;
+ ALU64_ARSH_K:
+ (*(s64 *) &DST) >>= IMM;
+ CONT;
+ ALU64_MOD_X:
+ if (unlikely(SRC == 0))
+ return 0;
+ div64_u64_rem(DST, SRC, &tmp);
+ DST = tmp;
+ CONT;
+ ALU_MOD_X:
+ if (unlikely((u32)SRC == 0))
+ return 0;
+ tmp = (u32) DST;
+ DST = do_div(tmp, (u32) SRC);
+ CONT;
+ ALU64_MOD_K:
+ div64_u64_rem(DST, IMM, &tmp);
+ DST = tmp;
+ CONT;
+ ALU_MOD_K:
+ tmp = (u32) DST;
+ DST = do_div(tmp, (u32) IMM);
+ CONT;
+ ALU64_DIV_X:
+ if (unlikely(SRC == 0))
+ return 0;
+ DST = div64_u64(DST, SRC);
+ CONT;
+ ALU_DIV_X:
+ if (unlikely((u32)SRC == 0))
+ return 0;
+ tmp = (u32) DST;
+ do_div(tmp, (u32) SRC);
+ DST = (u32) tmp;
+ CONT;
+ ALU64_DIV_K:
+ DST = div64_u64(DST, IMM);
+ CONT;
+ ALU_DIV_K:
+ tmp = (u32) DST;
+ do_div(tmp, (u32) IMM);
+ DST = (u32) tmp;
+ CONT;
+ ALU_END_TO_BE:
+ switch (IMM) {
+ case 16:
+ DST = (__force u16) cpu_to_be16(DST);
+ break;
+ case 32:
+ DST = (__force u32) cpu_to_be32(DST);
+ break;
+ case 64:
+ DST = (__force u64) cpu_to_be64(DST);
+ break;
+ }
+ CONT;
+ ALU_END_TO_LE:
+ switch (IMM) {
+ case 16:
+ DST = (__force u16) cpu_to_le16(DST);
+ break;
+ case 32:
+ DST = (__force u32) cpu_to_le32(DST);
+ break;
+ case 64:
+ DST = (__force u64) cpu_to_le64(DST);
+ break;
+ }
+ CONT;
+
+ /* CALL */
+ JMP_CALL:
+ /* Function call scratches BPF_R1-BPF_R5 registers,
+ * preserves BPF_R6-BPF_R9, and stores return value
+ * into BPF_R0.
+ */
+ BPF_R0 = (__bpf_call_base + insn->imm)(BPF_R1, BPF_R2, BPF_R3,
+ BPF_R4, BPF_R5);
+ CONT;
+
+ JMP_TAIL_CALL: {
+ struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2;
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ struct bpf_prog *prog;
+ u32 index = BPF_R3;
+
+ if (unlikely(index >= array->map.max_entries))
+ goto out;
+
+ if (unlikely(tail_call_cnt > MAX_TAIL_CALL_CNT))
+ goto out;
+
+ tail_call_cnt++;
+
+ prog = READ_ONCE(array->ptrs[index]);
+ if (unlikely(!prog))
+ goto out;
+
+ /* ARG1 at this point is guaranteed to point to CTX from
+ * the verifier side due to the fact that the tail call is
+ * handeled like a helper, that is, bpf_tail_call_proto,
+ * where arg1_type is ARG_PTR_TO_CTX.
+ */
+ insn = prog->insnsi;
+ goto select_insn;
+out:
+ CONT;
+ }
+ /* JMP */
+ JMP_JA:
+ insn += insn->off;
+ CONT;
+ JMP_JEQ_X:
+ if (DST == SRC) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JEQ_K:
+ if (DST == IMM) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JNE_X:
+ if (DST != SRC) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JNE_K:
+ if (DST != IMM) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JGT_X:
+ if (DST > SRC) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JGT_K:
+ if (DST > IMM) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JGE_X:
+ if (DST >= SRC) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JGE_K:
+ if (DST >= IMM) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JSGT_X:
+ if (((s64) DST) > ((s64) SRC)) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JSGT_K:
+ if (((s64) DST) > ((s64) IMM)) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JSGE_X:
+ if (((s64) DST) >= ((s64) SRC)) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JSGE_K:
+ if (((s64) DST) >= ((s64) IMM)) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JSET_X:
+ if (DST & SRC) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JSET_K:
+ if (DST & IMM) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_EXIT:
+ return BPF_R0;
+
+ /* STX and ST and LDX*/
+#define LDST(SIZEOP, SIZE) \
+ STX_MEM_##SIZEOP: \
+ *(SIZE *)(unsigned long) (DST + insn->off) = SRC; \
+ CONT; \
+ ST_MEM_##SIZEOP: \
+ *(SIZE *)(unsigned long) (DST + insn->off) = IMM; \
+ CONT; \
+ LDX_MEM_##SIZEOP: \
+ DST = *(SIZE *)(unsigned long) (SRC + insn->off); \
+ CONT;
+
+ LDST(B, u8)
+ LDST(H, u16)
+ LDST(W, u32)
+ LDST(DW, u64)
+#undef LDST
+ STX_XADD_W: /* lock xadd *(u32 *)(dst_reg + off16) += src_reg */
+ atomic_add((u32) SRC, (atomic_t *)(unsigned long)
+ (DST + insn->off));
+ CONT;
+ STX_XADD_DW: /* lock xadd *(u64 *)(dst_reg + off16) += src_reg */
+ atomic64_add((u64) SRC, (atomic64_t *)(unsigned long)
+ (DST + insn->off));
+ CONT;
+ LD_ABS_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + imm32)) */
+ off = IMM;
+load_word:
+ /* BPF_LD + BPD_ABS and BPF_LD + BPF_IND insns are
+ * only appearing in the programs where ctx ==
+ * skb. All programs keep 'ctx' in regs[BPF_REG_CTX]
+ * == BPF_R6, bpf_convert_filter() saves it in BPF_R6,
+ * internal BPF verifier will check that BPF_R6 ==
+ * ctx.
+ *
+ * BPF_ABS and BPF_IND are wrappers of function calls,
+ * so they scratch BPF_R1-BPF_R5 registers, preserve
+ * BPF_R6-BPF_R9, and store return value into BPF_R0.
+ *
+ * Implicit input:
+ * ctx == skb == BPF_R6 == CTX
+ *
+ * Explicit input:
+ * SRC == any register
+ * IMM == 32-bit immediate
+ *
+ * Output:
+ * BPF_R0 - 8/16/32-bit skb data converted to cpu endianness
+ */
+
+ ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 4, &tmp);
+ if (likely(ptr != NULL)) {
+ BPF_R0 = get_unaligned_be32(ptr);
+ CONT;
+ }
+
+ return 0;
+ LD_ABS_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + imm32)) */
+ off = IMM;
+load_half:
+ ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 2, &tmp);
+ if (likely(ptr != NULL)) {
+ BPF_R0 = get_unaligned_be16(ptr);
+ CONT;
+ }
+
+ return 0;
+ LD_ABS_B: /* BPF_R0 = *(u8 *) (skb->data + imm32) */
+ off = IMM;
+load_byte:
+ ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 1, &tmp);
+ if (likely(ptr != NULL)) {
+ BPF_R0 = *(u8 *)ptr;
+ CONT;
+ }
+
+ return 0;
+ LD_IND_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + src_reg + imm32)) */
+ off = IMM + SRC;
+ goto load_word;
+ LD_IND_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + src_reg + imm32)) */
+ off = IMM + SRC;
+ goto load_half;
+ LD_IND_B: /* BPF_R0 = *(u8 *) (skb->data + src_reg + imm32) */
+ off = IMM + SRC;
+ goto load_byte;
+
+ default_label:
+ /* If we ever reach this, we have a bug somewhere. */
+ WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code);
+ return 0;
+}
+
+#else
+static unsigned int __bpf_prog_ret0(void *ctx, const struct bpf_insn *insn)
+{
+ return 0;
+}
+#endif
+
+bool bpf_prog_array_compatible(struct bpf_array *array,
+ const struct bpf_prog *fp)
+{
+ if (!array->owner_prog_type) {
+ /* There's no owner yet where we could check for
+ * compatibility.
+ */
+ array->owner_prog_type = fp->type;
+ array->owner_jited = fp->jited;
+
+ return true;
+ }
+
+ return array->owner_prog_type == fp->type &&
+ array->owner_jited == fp->jited;
+}
+
+static int bpf_check_tail_call(const struct bpf_prog *fp)
+{
+ struct bpf_prog_aux *aux = fp->aux;
+ int i;
+
+ for (i = 0; i < aux->used_map_cnt; i++) {
+ struct bpf_map *map = aux->used_maps[i];
+ struct bpf_array *array;
+
+ if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
+ continue;
+
+ array = container_of(map, struct bpf_array, map);
+ if (!bpf_prog_array_compatible(array, fp))
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/**
+ * bpf_prog_select_runtime - select exec runtime for BPF program
+ * @fp: bpf_prog populated with internal BPF program
+ *
+ * Try to JIT eBPF program, if JIT is not available, use interpreter.
+ * The BPF program will be executed via BPF_PROG_RUN() macro.
+ */
+int bpf_prog_select_runtime(struct bpf_prog *fp)
+{
+#ifndef CONFIG_BPF_JIT_ALWAYS_ON
+ fp->bpf_func = (void *) __bpf_prog_run;
+#else
+ fp->bpf_func = (void *) __bpf_prog_ret0;
+#endif
+
+ /* eBPF JITs can rewrite the program in case constant
+ * blinding is active. However, in case of error during
+ * blinding, bpf_int_jit_compile() must always return a
+ * valid program, which in this case would simply not
+ * be JITed, but falls back to the interpreter.
+ */
+ bpf_int_jit_compile(fp);
+#ifdef CONFIG_BPF_JIT_ALWAYS_ON
+ if (!fp->jited)
+ return -ENOTSUPP;
+#endif
+ bpf_prog_lock_ro(fp);
+
+ /* The tail call compatibility check can only be done at
+ * this late stage as we need to determine, if we deal
+ * with JITed or non JITed program concatenations and not
+ * all eBPF JITs might immediately support all features.
+ */
+ return bpf_check_tail_call(fp);
+}
+EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);
+
+static void bpf_prog_free_deferred(struct work_struct *work)
+{
+ struct bpf_prog_aux *aux;
+
+ aux = container_of(work, struct bpf_prog_aux, work);
+ bpf_jit_free(aux->prog);
+}
+
+/* Free internal BPF program */
+void bpf_prog_free(struct bpf_prog *fp)
+{
+ struct bpf_prog_aux *aux = fp->aux;
+
+ INIT_WORK(&aux->work, bpf_prog_free_deferred);
+ schedule_work(&aux->work);
+}
+EXPORT_SYMBOL_GPL(bpf_prog_free);
+
+/* RNG for unpriviledged user space with separated state from prandom_u32(). */
+static DEFINE_PER_CPU(struct rnd_state, bpf_user_rnd_state);
+
+void bpf_user_rnd_init_once(void)
+{
+ prandom_init_once(&bpf_user_rnd_state);
+}
+
+u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ /* Should someone ever have the rather unwise idea to use some
+ * of the registers passed into this function, then note that
+ * this function is called from native eBPF and classic-to-eBPF
+ * transformations. Register assignments from both sides are
+ * different, f.e. classic always sets fn(ctx, A, X) here.
+ */
+ struct rnd_state *state;
+ u32 res;
+
+ state = &get_cpu_var(bpf_user_rnd_state);
+ res = prandom_u32_state(state);
+ put_cpu_var(state);
+
+ return res;
+}
+
+/* Weak definitions of helper functions in case we don't have bpf syscall. */
+const struct bpf_func_proto bpf_map_lookup_elem_proto __weak;
+const struct bpf_func_proto bpf_map_update_elem_proto __weak;
+const struct bpf_func_proto bpf_map_delete_elem_proto __weak;
+
+const struct bpf_func_proto bpf_get_prandom_u32_proto __weak;
+const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak;
+const struct bpf_func_proto bpf_ktime_get_ns_proto __weak;
+const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak;
+const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
+const struct bpf_func_proto bpf_get_current_comm_proto __weak;
+const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
+{
+ return NULL;
+}
+
+/* Always built-in helper functions. */
+const struct bpf_func_proto bpf_tail_call_proto = {
+ .func = NULL,
+ .gpl_only = false,
+ .ret_type = RET_VOID,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_ANYTHING,
+};
+
+/* For classic BPF JITs that don't implement bpf_int_jit_compile(). */
+void __weak bpf_int_jit_compile(struct bpf_prog *prog)
+{
+}
+
+/* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call
+ * skb_copy_bits(), so provide a weak definition of it for NET-less config.
+ */
+int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to,
+ int len)
+{
+ return -EFAULT;
+}
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
new file mode 100644
index 000000000000..a35abe048239
--- /dev/null
+++ b/kernel/bpf/hashtab.c
@@ -0,0 +1,388 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/bpf.h>
+#include <linux/jhash.h>
+#include <linux/filter.h>
+#include <linux/vmalloc.h>
+
+struct bpf_htab {
+ struct bpf_map map;
+ struct hlist_head *buckets;
+ raw_spinlock_t lock;
+ u32 count; /* number of elements in this hashtable */
+ u32 n_buckets; /* number of hash buckets */
+ u32 elem_size; /* size of each element in bytes */
+};
+
+/* each htab element is struct htab_elem + key + value */
+struct htab_elem {
+ struct hlist_node hash_node;
+ struct rcu_head rcu;
+ u32 hash;
+ char key[0] __aligned(8);
+};
+
+/* Called from syscall */
+static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
+{
+ struct bpf_htab *htab;
+ int err, i;
+
+ htab = kzalloc(sizeof(*htab), GFP_USER);
+ if (!htab)
+ return ERR_PTR(-ENOMEM);
+
+ /* mandatory map attributes */
+ htab->map.key_size = attr->key_size;
+ htab->map.value_size = attr->value_size;
+ htab->map.max_entries = attr->max_entries;
+
+ /* check sanity of attributes.
+ * value_size == 0 may be allowed in the future to use map as a set
+ */
+ err = -EINVAL;
+ if (htab->map.max_entries == 0 || htab->map.key_size == 0 ||
+ htab->map.value_size == 0)
+ goto free_htab;
+
+ /* hash table size must be power of 2 */
+ htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);
+
+ err = -E2BIG;
+ if (htab->map.key_size > MAX_BPF_STACK)
+ /* eBPF programs initialize keys on stack, so they cannot be
+ * larger than max stack size
+ */
+ goto free_htab;
+
+ if (htab->map.value_size >= (1 << (KMALLOC_SHIFT_MAX - 1)) -
+ MAX_BPF_STACK - sizeof(struct htab_elem))
+ /* if value_size is bigger, the user space won't be able to
+ * access the elements via bpf syscall. This check also makes
+ * sure that the elem_size doesn't overflow and it's
+ * kmalloc-able later in htab_map_update_elem()
+ */
+ goto free_htab;
+
+ htab->elem_size = sizeof(struct htab_elem) +
+ round_up(htab->map.key_size, 8) +
+ htab->map.value_size;
+
+ /* prevent zero size kmalloc and check for u32 overflow */
+ if (htab->n_buckets == 0 ||
+ htab->n_buckets > U32_MAX / sizeof(struct hlist_head))
+ goto free_htab;
+
+ if ((u64) htab->n_buckets * sizeof(struct hlist_head) +
+ (u64) htab->elem_size * htab->map.max_entries >=
+ U32_MAX - PAGE_SIZE)
+ /* make sure page count doesn't overflow */
+ goto free_htab;
+
+ htab->map.pages = round_up(htab->n_buckets * sizeof(struct hlist_head) +
+ htab->elem_size * htab->map.max_entries,
+ PAGE_SIZE) >> PAGE_SHIFT;
+
+ err = -ENOMEM;
+ htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct hlist_head),
+ GFP_USER | __GFP_NOWARN);
+
+ if (!htab->buckets) {
+ htab->buckets = vmalloc(htab->n_buckets * sizeof(struct hlist_head));
+ if (!htab->buckets)
+ goto free_htab;
+ }
+
+ for (i = 0; i < htab->n_buckets; i++)
+ INIT_HLIST_HEAD(&htab->buckets[i]);
+
+ raw_spin_lock_init(&htab->lock);
+ htab->count = 0;
+
+ return &htab->map;
+
+free_htab:
+ kfree(htab);
+ return ERR_PTR(err);
+}
+
+static inline u32 htab_map_hash(const void *key, u32 key_len)
+{
+ return jhash(key, key_len, 0);
+}
+
+static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
+{
+ return &htab->buckets[hash & (htab->n_buckets - 1)];
+}
+
+static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash,
+ void *key, u32 key_size)
+{
+ struct htab_elem *l;
+
+ hlist_for_each_entry_rcu(l, head, hash_node)
+ if (l->hash == hash && !memcmp(&l->key, key, key_size))
+ return l;
+
+ return NULL;
+}
+
+/* Called from syscall or from eBPF program */
+static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct hlist_head *head;
+ struct htab_elem *l;
+ u32 hash, key_size;
+
+ /* Must be called with rcu_read_lock. */
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ key_size = map->key_size;
+
+ hash = htab_map_hash(key, key_size);
+
+ head = select_bucket(htab, hash);
+
+ l = lookup_elem_raw(head, hash, key, key_size);
+
+ if (l)
+ return l->key + round_up(map->key_size, 8);
+
+ return NULL;
+}
+
+/* Called from syscall */
+static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct hlist_head *head;
+ struct htab_elem *l, *next_l;
+ u32 hash, key_size;
+ int i = 0;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ key_size = map->key_size;
+
+ if (!key)
+ goto find_first_elem;
+
+ hash = htab_map_hash(key, key_size);
+
+ head = select_bucket(htab, hash);
+
+ /* lookup the key */
+ l = lookup_elem_raw(head, hash, key, key_size);
+
+ if (!l)
+ goto find_first_elem;
+
+ /* key was found, get next key in the same bucket */
+ next_l = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&l->hash_node)),
+ struct htab_elem, hash_node);
+
+ if (next_l) {
+ /* if next elem in this hash list is non-zero, just return it */
+ memcpy(next_key, next_l->key, key_size);
+ return 0;
+ }
+
+ /* no more elements in this hash list, go to the next bucket */
+ i = hash & (htab->n_buckets - 1);
+ i++;
+
+find_first_elem:
+ /* iterate over buckets */
+ for (; i < htab->n_buckets; i++) {
+ head = select_bucket(htab, i);
+
+ /* pick first element in the bucket */
+ next_l = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
+ struct htab_elem, hash_node);
+ if (next_l) {
+ /* if it's not empty, just return it */
+ memcpy(next_key, next_l->key, key_size);
+ return 0;
+ }
+ }
+
+ /* itereated over all buckets and all elements */
+ return -ENOENT;
+}
+
+/* Called from syscall or from eBPF program */
+static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct htab_elem *l_new, *l_old;
+ struct hlist_head *head;
+ unsigned long flags;
+ u32 key_size;
+ int ret;
+
+ if (map_flags > BPF_EXIST)
+ /* unknown flags */
+ return -EINVAL;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ /* allocate new element outside of lock */
+ l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN);
+ if (!l_new)
+ return -ENOMEM;
+
+ key_size = map->key_size;
+
+ memcpy(l_new->key, key, key_size);
+ memcpy(l_new->key + round_up(key_size, 8), value, map->value_size);
+
+ l_new->hash = htab_map_hash(l_new->key, key_size);
+
+ /* bpf_map_update_elem() can be called in_irq() */
+ raw_spin_lock_irqsave(&htab->lock, flags);
+
+ head = select_bucket(htab, l_new->hash);
+
+ l_old = lookup_elem_raw(head, l_new->hash, key, key_size);
+
+ if (!l_old && unlikely(htab->count >= map->max_entries)) {
+ /* if elem with this 'key' doesn't exist and we've reached
+ * max_entries limit, fail insertion of new elem
+ */
+ ret = -E2BIG;
+ goto err;
+ }
+
+ if (l_old && map_flags == BPF_NOEXIST) {
+ /* elem already exists */
+ ret = -EEXIST;
+ goto err;
+ }
+
+ if (!l_old && map_flags == BPF_EXIST) {
+ /* elem doesn't exist, cannot update it */
+ ret = -ENOENT;
+ goto err;
+ }
+
+ /* add new element to the head of the list, so that concurrent
+ * search will find it before old elem
+ */
+ hlist_add_head_rcu(&l_new->hash_node, head);
+ if (l_old) {
+ hlist_del_rcu(&l_old->hash_node);
+ kfree_rcu(l_old, rcu);
+ } else {
+ htab->count++;
+ }
+ raw_spin_unlock_irqrestore(&htab->lock, flags);
+
+ return 0;
+err:
+ raw_spin_unlock_irqrestore(&htab->lock, flags);
+ kfree(l_new);
+ return ret;
+}
+
+/* Called from syscall or from eBPF program */
+static int htab_map_delete_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct hlist_head *head;
+ struct htab_elem *l;
+ unsigned long flags;
+ u32 hash, key_size;
+ int ret = -ENOENT;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ key_size = map->key_size;
+
+ hash = htab_map_hash(key, key_size);
+
+ raw_spin_lock_irqsave(&htab->lock, flags);
+
+ head = select_bucket(htab, hash);
+
+ l = lookup_elem_raw(head, hash, key, key_size);
+
+ if (l) {
+ hlist_del_rcu(&l->hash_node);
+ htab->count--;
+ kfree_rcu(l, rcu);
+ ret = 0;
+ }
+
+ raw_spin_unlock_irqrestore(&htab->lock, flags);
+ return ret;
+}
+
+static void delete_all_elements(struct bpf_htab *htab)
+{
+ int i;
+
+ for (i = 0; i < htab->n_buckets; i++) {
+ struct hlist_head *head = select_bucket(htab, i);
+ struct hlist_node *n;
+ struct htab_elem *l;
+
+ hlist_for_each_entry_safe(l, n, head, hash_node) {
+ hlist_del_rcu(&l->hash_node);
+ htab->count--;
+ kfree(l);
+ }
+ }
+}
+
+/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
+static void htab_map_free(struct bpf_map *map)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+
+ /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
+ * so the programs (can be more than one that used this map) were
+ * disconnected from events. Wait for outstanding critical sections in
+ * these programs to complete
+ */
+ synchronize_rcu();
+
+ /* some of kfree_rcu() callbacks for elements of this map may not have
+ * executed. It's ok. Proceed to free residual elements and map itself
+ */
+ delete_all_elements(htab);
+ kvfree(htab->buckets);
+ kfree(htab);
+}
+
+static const struct bpf_map_ops htab_ops = {
+ .map_alloc = htab_map_alloc,
+ .map_free = htab_map_free,
+ .map_get_next_key = htab_map_get_next_key,
+ .map_lookup_elem = htab_map_lookup_elem,
+ .map_update_elem = htab_map_update_elem,
+ .map_delete_elem = htab_map_delete_elem,
+};
+
+static struct bpf_map_type_list htab_type __read_mostly = {
+ .ops = &htab_ops,
+ .type = BPF_MAP_TYPE_HASH,
+};
+
+static int __init register_htab_map(void)
+{
+ bpf_register_map_type(&htab_type);
+ return 0;
+}
+late_initcall(register_htab_map);
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
new file mode 100644
index 000000000000..50da680c479f
--- /dev/null
+++ b/kernel/bpf/helpers.c
@@ -0,0 +1,179 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/bpf.h>
+#include <linux/rcupdate.h>
+#include <linux/random.h>
+#include <linux/smp.h>
+#include <linux/ktime.h>
+#include <linux/sched.h>
+#include <linux/uidgid.h>
+
+/* If kernel subsystem is allowing eBPF programs to call this function,
+ * inside its own verifier_ops->get_func_proto() callback it should return
+ * bpf_map_lookup_elem_proto, so that verifier can properly check the arguments
+ *
+ * Different map implementations will rely on rcu in map methods
+ * lookup/update/delete, therefore eBPF programs must run under rcu lock
+ * if program is allowed to access maps, so check rcu_read_lock_held in
+ * all three functions.
+ */
+static u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ /* verifier checked that R1 contains a valid pointer to bpf_map
+ * and R2 points to a program stack and map->key_size bytes were
+ * initialized
+ */
+ struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
+ void *key = (void *) (unsigned long) r2;
+ void *value;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ value = map->ops->map_lookup_elem(map, key);
+
+ /* lookup() returns either pointer to element value or NULL
+ * which is the meaning of PTR_TO_MAP_VALUE_OR_NULL type
+ */
+ return (unsigned long) value;
+}
+
+const struct bpf_func_proto bpf_map_lookup_elem_proto = {
+ .func = bpf_map_lookup_elem,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_MAP_KEY,
+};
+
+static u64 bpf_map_update_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
+ void *key = (void *) (unsigned long) r2;
+ void *value = (void *) (unsigned long) r3;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ return map->ops->map_update_elem(map, key, value, r4);
+}
+
+const struct bpf_func_proto bpf_map_update_elem_proto = {
+ .func = bpf_map_update_elem,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_MAP_KEY,
+ .arg3_type = ARG_PTR_TO_MAP_VALUE,
+ .arg4_type = ARG_ANYTHING,
+};
+
+static u64 bpf_map_delete_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
+ void *key = (void *) (unsigned long) r2;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ return map->ops->map_delete_elem(map, key);
+}
+
+const struct bpf_func_proto bpf_map_delete_elem_proto = {
+ .func = bpf_map_delete_elem,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_MAP_KEY,
+};
+
+const struct bpf_func_proto bpf_get_prandom_u32_proto = {
+ .func = bpf_user_rnd_u32,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+};
+
+static u64 bpf_get_smp_processor_id(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ return raw_smp_processor_id();
+}
+
+const struct bpf_func_proto bpf_get_smp_processor_id_proto = {
+ .func = bpf_get_smp_processor_id,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+};
+
+static u64 bpf_ktime_get_ns(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ /* NMI safe access to clock monotonic */
+ return ktime_get_mono_fast_ns();
+}
+
+const struct bpf_func_proto bpf_ktime_get_ns_proto = {
+ .func = bpf_ktime_get_ns,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+};
+
+static u64 bpf_get_current_pid_tgid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ struct task_struct *task = current;
+
+ if (!task)
+ return -EINVAL;
+
+ return (u64) task->tgid << 32 | task->pid;
+}
+
+const struct bpf_func_proto bpf_get_current_pid_tgid_proto = {
+ .func = bpf_get_current_pid_tgid,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+};
+
+static u64 bpf_get_current_uid_gid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ struct task_struct *task = current;
+ kuid_t uid;
+ kgid_t gid;
+
+ if (!task)
+ return -EINVAL;
+
+ current_uid_gid(&uid, &gid);
+ return (u64) from_kgid(&init_user_ns, gid) << 32 |
+ from_kuid(&init_user_ns, uid);
+}
+
+const struct bpf_func_proto bpf_get_current_uid_gid_proto = {
+ .func = bpf_get_current_uid_gid,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+};
+
+static u64 bpf_get_current_comm(u64 r1, u64 size, u64 r3, u64 r4, u64 r5)
+{
+ struct task_struct *task = current;
+ char *buf = (char *) (long) r1;
+
+ if (!task)
+ return -EINVAL;
+
+ strlcpy(buf, task->comm, min_t(size_t, size, sizeof(task->comm)));
+ return 0;
+}
+
+const struct bpf_func_proto bpf_get_current_comm_proto = {
+ .func = bpf_get_current_comm,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_STACK,
+ .arg2_type = ARG_CONST_STACK_SIZE,
+};
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
new file mode 100644
index 000000000000..cb85d228b1ac
--- /dev/null
+++ b/kernel/bpf/inode.c
@@ -0,0 +1,387 @@
+/*
+ * Minimal file system backend for holding eBPF maps and programs,
+ * used by bpf(2) object pinning.
+ *
+ * Authors:
+ *
+ * Daniel Borkmann <daniel@iogearbox.net>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/magic.h>
+#include <linux/major.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/fs.h>
+#include <linux/kdev_t.h>
+#include <linux/filter.h>
+#include <linux/bpf.h>
+
+enum bpf_type {
+ BPF_TYPE_UNSPEC = 0,
+ BPF_TYPE_PROG,
+ BPF_TYPE_MAP,
+};
+
+static void *bpf_any_get(void *raw, enum bpf_type type)
+{
+ switch (type) {
+ case BPF_TYPE_PROG:
+ raw = bpf_prog_inc(raw);
+ break;
+ case BPF_TYPE_MAP:
+ raw = bpf_map_inc(raw, true);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
+
+ return raw;
+}
+
+static void bpf_any_put(void *raw, enum bpf_type type)
+{
+ switch (type) {
+ case BPF_TYPE_PROG:
+ bpf_prog_put(raw);
+ break;
+ case BPF_TYPE_MAP:
+ bpf_map_put_with_uref(raw);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
+}
+
+static void *bpf_fd_probe_obj(u32 ufd, enum bpf_type *type)
+{
+ void *raw;
+
+ *type = BPF_TYPE_MAP;
+ raw = bpf_map_get_with_uref(ufd);
+ if (IS_ERR(raw)) {
+ *type = BPF_TYPE_PROG;
+ raw = bpf_prog_get(ufd);
+ }
+
+ return raw;
+}
+
+static const struct inode_operations bpf_dir_iops;
+
+static const struct inode_operations bpf_prog_iops = { };
+static const struct inode_operations bpf_map_iops = { };
+
+static struct inode *bpf_get_inode(struct super_block *sb,
+ const struct inode *dir,
+ umode_t mode)
+{
+ struct inode *inode;
+
+ switch (mode & S_IFMT) {
+ case S_IFDIR:
+ case S_IFREG:
+ break;
+ default:
+ return ERR_PTR(-EINVAL);
+ }
+
+ inode = new_inode(sb);
+ if (!inode)
+ return ERR_PTR(-ENOSPC);
+
+ inode->i_ino = get_next_ino();
+ inode->i_atime = CURRENT_TIME;
+ inode->i_mtime = inode->i_atime;
+ inode->i_ctime = inode->i_atime;
+
+ inode_init_owner(inode, dir, mode);
+
+ return inode;
+}
+
+static int bpf_inode_type(const struct inode *inode, enum bpf_type *type)
+{
+ *type = BPF_TYPE_UNSPEC;
+ if (inode->i_op == &bpf_prog_iops)
+ *type = BPF_TYPE_PROG;
+ else if (inode->i_op == &bpf_map_iops)
+ *type = BPF_TYPE_MAP;
+ else
+ return -EACCES;
+
+ return 0;
+}
+
+static bool bpf_dname_reserved(const struct dentry *dentry)
+{
+ return strchr(dentry->d_name.name, '.');
+}
+
+static int bpf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+ struct inode *inode;
+
+ if (bpf_dname_reserved(dentry))
+ return -EPERM;
+
+ inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFDIR);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
+ inode->i_op = &bpf_dir_iops;
+ inode->i_fop = &simple_dir_operations;
+
+ inc_nlink(inode);
+ inc_nlink(dir);
+
+ d_instantiate(dentry, inode);
+ dget(dentry);
+
+ return 0;
+}
+
+static int bpf_mkobj_ops(struct inode *dir, struct dentry *dentry,
+ umode_t mode, const struct inode_operations *iops)
+{
+ struct inode *inode;
+
+ if (bpf_dname_reserved(dentry))
+ return -EPERM;
+
+ inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFREG);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
+ inode->i_op = iops;
+ inode->i_private = dentry->d_fsdata;
+
+ d_instantiate(dentry, inode);
+ dget(dentry);
+
+ return 0;
+}
+
+static int bpf_mkobj(struct inode *dir, struct dentry *dentry, umode_t mode,
+ dev_t devt)
+{
+ enum bpf_type type = MINOR(devt);
+
+ if (MAJOR(devt) != UNNAMED_MAJOR || !S_ISREG(mode) ||
+ dentry->d_fsdata == NULL)
+ return -EPERM;
+
+ switch (type) {
+ case BPF_TYPE_PROG:
+ return bpf_mkobj_ops(dir, dentry, mode, &bpf_prog_iops);
+ case BPF_TYPE_MAP:
+ return bpf_mkobj_ops(dir, dentry, mode, &bpf_map_iops);
+ default:
+ return -EPERM;
+ }
+}
+
+static const struct inode_operations bpf_dir_iops = {
+ .lookup = simple_lookup,
+ .mknod = bpf_mkobj,
+ .mkdir = bpf_mkdir,
+ .rmdir = simple_rmdir,
+ .unlink = simple_unlink,
+};
+
+static int bpf_obj_do_pin(const struct filename *pathname, void *raw,
+ enum bpf_type type)
+{
+ struct dentry *dentry;
+ struct inode *dir;
+ struct path path;
+ umode_t mode;
+ dev_t devt;
+ int ret;
+
+ dentry = kern_path_create(AT_FDCWD, pathname->name, &path, 0);
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
+
+ mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask());
+ devt = MKDEV(UNNAMED_MAJOR, type);
+
+ ret = security_path_mknod(&path, dentry, mode, devt);
+ if (ret)
+ goto out;
+
+ dir = d_inode(path.dentry);
+ if (dir->i_op != &bpf_dir_iops) {
+ ret = -EPERM;
+ goto out;
+ }
+
+ dentry->d_fsdata = raw;
+ ret = vfs_mknod(dir, dentry, mode, devt);
+ dentry->d_fsdata = NULL;
+out:
+ done_path_create(&path, dentry);
+ return ret;
+}
+
+int bpf_obj_pin_user(u32 ufd, const char __user *pathname)
+{
+ struct filename *pname;
+ enum bpf_type type;
+ void *raw;
+ int ret;
+
+ pname = getname(pathname);
+ if (IS_ERR(pname))
+ return PTR_ERR(pname);
+
+ raw = bpf_fd_probe_obj(ufd, &type);
+ if (IS_ERR(raw)) {
+ ret = PTR_ERR(raw);
+ goto out;
+ }
+
+ ret = bpf_obj_do_pin(pname, raw, type);
+ if (ret != 0)
+ bpf_any_put(raw, type);
+out:
+ putname(pname);
+ return ret;
+}
+
+static void *bpf_obj_do_get(const struct filename *pathname,
+ enum bpf_type *type)
+{
+ struct inode *inode;
+ struct path path;
+ void *raw;
+ int ret;
+
+ ret = kern_path(pathname->name, LOOKUP_FOLLOW, &path);
+ if (ret)
+ return ERR_PTR(ret);
+
+ inode = d_backing_inode(path.dentry);
+ ret = inode_permission(inode, MAY_WRITE);
+ if (ret)
+ goto out;
+
+ ret = bpf_inode_type(inode, type);
+ if (ret)
+ goto out;
+
+ raw = bpf_any_get(inode->i_private, *type);
+ if (!IS_ERR(raw))
+ touch_atime(&path);
+
+ path_put(&path);
+ return raw;
+out:
+ path_put(&path);
+ return ERR_PTR(ret);
+}
+
+int bpf_obj_get_user(const char __user *pathname)
+{
+ enum bpf_type type = BPF_TYPE_UNSPEC;
+ struct filename *pname;
+ int ret = -ENOENT;
+ void *raw;
+
+ pname = getname(pathname);
+ if (IS_ERR(pname))
+ return PTR_ERR(pname);
+
+ raw = bpf_obj_do_get(pname, &type);
+ if (IS_ERR(raw)) {
+ ret = PTR_ERR(raw);
+ goto out;
+ }
+
+ if (type == BPF_TYPE_PROG)
+ ret = bpf_prog_new_fd(raw);
+ else if (type == BPF_TYPE_MAP)
+ ret = bpf_map_new_fd(raw);
+ else
+ goto out;
+
+ if (ret < 0)
+ bpf_any_put(raw, type);
+out:
+ putname(pname);
+ return ret;
+}
+
+static void bpf_evict_inode(struct inode *inode)
+{
+ enum bpf_type type;
+
+ truncate_inode_pages_final(&inode->i_data);
+ clear_inode(inode);
+
+ if (!bpf_inode_type(inode, &type))
+ bpf_any_put(inode->i_private, type);
+}
+
+static const struct super_operations bpf_super_ops = {
+ .statfs = simple_statfs,
+ .drop_inode = generic_delete_inode,
+ .evict_inode = bpf_evict_inode,
+};
+
+static int bpf_fill_super(struct super_block *sb, void *data, int silent)
+{
+ static struct tree_descr bpf_rfiles[] = { { "" } };
+ struct inode *inode;
+ int ret;
+
+ ret = simple_fill_super(sb, BPF_FS_MAGIC, bpf_rfiles);
+ if (ret)
+ return ret;
+
+ sb->s_op = &bpf_super_ops;
+
+ inode = sb->s_root->d_inode;
+ inode->i_op = &bpf_dir_iops;
+ inode->i_mode &= ~S_IALLUGO;
+ inode->i_mode |= S_ISVTX | S_IRWXUGO;
+
+ return 0;
+}
+
+static struct dentry *bpf_mount(struct file_system_type *type, int flags,
+ const char *dev_name, void *data)
+{
+ return mount_nodev(type, flags, data, bpf_fill_super);
+}
+
+static struct file_system_type bpf_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "bpf",
+ .mount = bpf_mount,
+ .kill_sb = kill_litter_super,
+};
+
+MODULE_ALIAS_FS("bpf");
+
+static int __init bpf_init(void)
+{
+ int ret;
+
+ ret = sysfs_create_mount_point(fs_kobj, "bpf");
+ if (ret)
+ return ret;
+
+ ret = register_filesystem(&bpf_fs_type);
+ if (ret)
+ sysfs_remove_mount_point(fs_kobj, "bpf");
+
+ return ret;
+}
+fs_initcall(bpf_init);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
new file mode 100644
index 000000000000..01431ef8cf07
--- /dev/null
+++ b/kernel/bpf/syscall.c
@@ -0,0 +1,743 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/bpf.h>
+#include <linux/syscalls.h>
+#include <linux/slab.h>
+#include <linux/anon_inodes.h>
+#include <linux/file.h>
+#include <linux/license.h>
+#include <linux/filter.h>
+#include <linux/version.h>
+
+int sysctl_unprivileged_bpf_disabled __read_mostly;
+
+static LIST_HEAD(bpf_map_types);
+
+static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
+{
+ struct bpf_map_type_list *tl;
+ struct bpf_map *map;
+
+ list_for_each_entry(tl, &bpf_map_types, list_node) {
+ if (tl->type == attr->map_type) {
+ map = tl->ops->map_alloc(attr);
+ if (IS_ERR(map))
+ return map;
+ map->ops = tl->ops;
+ map->map_type = attr->map_type;
+ return map;
+ }
+ }
+ return ERR_PTR(-EINVAL);
+}
+
+/* boot time registration of different map implementations */
+void bpf_register_map_type(struct bpf_map_type_list *tl)
+{
+ list_add(&tl->list_node, &bpf_map_types);
+}
+
+static int bpf_map_charge_memlock(struct bpf_map *map)
+{
+ struct user_struct *user = get_current_user();
+ unsigned long memlock_limit;
+
+ memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+
+ atomic_long_add(map->pages, &user->locked_vm);
+
+ if (atomic_long_read(&user->locked_vm) > memlock_limit) {
+ atomic_long_sub(map->pages, &user->locked_vm);
+ free_uid(user);
+ return -EPERM;
+ }
+ map->user = user;
+ return 0;
+}
+
+static void bpf_map_uncharge_memlock(struct bpf_map *map)
+{
+ struct user_struct *user = map->user;
+
+ atomic_long_sub(map->pages, &user->locked_vm);
+ free_uid(user);
+}
+
+/* called from workqueue */
+static void bpf_map_free_deferred(struct work_struct *work)
+{
+ struct bpf_map *map = container_of(work, struct bpf_map, work);
+
+ bpf_map_uncharge_memlock(map);
+ /* implementation dependent freeing */
+ map->ops->map_free(map);
+}
+
+static void bpf_map_put_uref(struct bpf_map *map)
+{
+ if (atomic_dec_and_test(&map->usercnt)) {
+ if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY)
+ bpf_fd_array_map_clear(map);
+ }
+}
+
+/* decrement map refcnt and schedule it for freeing via workqueue
+ * (unrelying map implementation ops->map_free() might sleep)
+ */
+void bpf_map_put(struct bpf_map *map)
+{
+ if (atomic_dec_and_test(&map->refcnt)) {
+ INIT_WORK(&map->work, bpf_map_free_deferred);
+ schedule_work(&map->work);
+ }
+}
+
+void bpf_map_put_with_uref(struct bpf_map *map)
+{
+ bpf_map_put_uref(map);
+ bpf_map_put(map);
+}
+
+static int bpf_map_release(struct inode *inode, struct file *filp)
+{
+ bpf_map_put_with_uref(filp->private_data);
+ return 0;
+}
+
+static const struct file_operations bpf_map_fops = {
+ .release = bpf_map_release,
+};
+
+int bpf_map_new_fd(struct bpf_map *map)
+{
+ return anon_inode_getfd("bpf-map", &bpf_map_fops, map,
+ O_RDWR | O_CLOEXEC);
+}
+
+/* helper macro to check that unused fields 'union bpf_attr' are zero */
+#define CHECK_ATTR(CMD) \
+ memchr_inv((void *) &attr->CMD##_LAST_FIELD + \
+ sizeof(attr->CMD##_LAST_FIELD), 0, \
+ sizeof(*attr) - \
+ offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
+ sizeof(attr->CMD##_LAST_FIELD)) != NULL
+
+#define BPF_MAP_CREATE_LAST_FIELD max_entries
+/* called via syscall */
+static int map_create(union bpf_attr *attr)
+{
+ struct bpf_map *map;
+ int err;
+
+ err = CHECK_ATTR(BPF_MAP_CREATE);
+ if (err)
+ return -EINVAL;
+
+ /* find map type and init map: hashtable vs rbtree vs bloom vs ... */
+ map = find_and_alloc_map(attr);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ atomic_set(&map->refcnt, 1);
+ atomic_set(&map->usercnt, 1);
+
+ err = bpf_map_charge_memlock(map);
+ if (err)
+ goto free_map_nouncharge;
+
+ err = bpf_map_new_fd(map);
+ if (err < 0)
+ /* failed to allocate fd */
+ goto free_map;
+
+ return err;
+
+free_map:
+ bpf_map_uncharge_memlock(map);
+free_map_nouncharge:
+ map->ops->map_free(map);
+ return err;
+}
+
+/* if error is returned, fd is released.
+ * On success caller should complete fd access with matching fdput()
+ */
+struct bpf_map *__bpf_map_get(struct fd f)
+{
+ if (!f.file)
+ return ERR_PTR(-EBADF);
+ if (f.file->f_op != &bpf_map_fops) {
+ fdput(f);
+ return ERR_PTR(-EINVAL);
+ }
+
+ return f.file->private_data;
+}
+
+/* prog's and map's refcnt limit */
+#define BPF_MAX_REFCNT 32768
+
+struct bpf_map *bpf_map_inc(struct bpf_map *map, bool uref)
+{
+ if (atomic_inc_return(&map->refcnt) > BPF_MAX_REFCNT) {
+ atomic_dec(&map->refcnt);
+ return ERR_PTR(-EBUSY);
+ }
+ if (uref)
+ atomic_inc(&map->usercnt);
+ return map;
+}
+
+struct bpf_map *bpf_map_get_with_uref(u32 ufd)
+{
+ struct fd f = fdget(ufd);
+ struct bpf_map *map;
+
+ map = __bpf_map_get(f);
+ if (IS_ERR(map))
+ return map;
+
+ map = bpf_map_inc(map, true);
+ fdput(f);
+
+ return map;
+}
+
+/* helper to convert user pointers passed inside __aligned_u64 fields */
+static void __user *u64_to_ptr(__u64 val)
+{
+ return (void __user *) (unsigned long) val;
+}
+
+/* last field in 'union bpf_attr' used by this command */
+#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value
+
+static int map_lookup_elem(union bpf_attr *attr)
+{
+ void __user *ukey = u64_to_ptr(attr->key);
+ void __user *uvalue = u64_to_ptr(attr->value);
+ int ufd = attr->map_fd;
+ struct bpf_map *map;
+ void *key, *value, *ptr;
+ struct fd f;
+ int err;
+
+ if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
+ return -EINVAL;
+
+ f = fdget(ufd);
+ map = __bpf_map_get(f);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ err = -ENOMEM;
+ key = kmalloc(map->key_size, GFP_USER);
+ if (!key)
+ goto err_put;
+
+ err = -EFAULT;
+ if (copy_from_user(key, ukey, map->key_size) != 0)
+ goto free_key;
+
+ err = -ENOMEM;
+ value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN);
+ if (!value)
+ goto free_key;
+
+ rcu_read_lock();
+ ptr = map->ops->map_lookup_elem(map, key);
+ if (ptr)
+ memcpy(value, ptr, map->value_size);
+ rcu_read_unlock();
+
+ err = -ENOENT;
+ if (!ptr)
+ goto free_value;
+
+ err = -EFAULT;
+ if (copy_to_user(uvalue, value, map->value_size) != 0)
+ goto free_value;
+
+ err = 0;
+
+free_value:
+ kfree(value);
+free_key:
+ kfree(key);
+err_put:
+ fdput(f);
+ return err;
+}
+
+#define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags
+
+static int map_update_elem(union bpf_attr *attr)
+{
+ void __user *ukey = u64_to_ptr(attr->key);
+ void __user *uvalue = u64_to_ptr(attr->value);
+ int ufd = attr->map_fd;
+ struct bpf_map *map;
+ void *key, *value;
+ struct fd f;
+ int err;
+
+ if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM))
+ return -EINVAL;
+
+ f = fdget(ufd);
+ map = __bpf_map_get(f);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ err = -ENOMEM;
+ key = kmalloc(map->key_size, GFP_USER);
+ if (!key)
+ goto err_put;
+
+ err = -EFAULT;
+ if (copy_from_user(key, ukey, map->key_size) != 0)
+ goto free_key;
+
+ err = -ENOMEM;
+ value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN);
+ if (!value)
+ goto free_key;
+
+ err = -EFAULT;
+ if (copy_from_user(value, uvalue, map->value_size) != 0)
+ goto free_value;
+
+ /* eBPF program that use maps are running under rcu_read_lock(),
+ * therefore all map accessors rely on this fact, so do the same here
+ */
+ rcu_read_lock();
+ err = map->ops->map_update_elem(map, key, value, attr->flags);
+ rcu_read_unlock();
+
+free_value:
+ kfree(value);
+free_key:
+ kfree(key);
+err_put:
+ fdput(f);
+ return err;
+}
+
+#define BPF_MAP_DELETE_ELEM_LAST_FIELD key
+
+static int map_delete_elem(union bpf_attr *attr)
+{
+ void __user *ukey = u64_to_ptr(attr->key);
+ int ufd = attr->map_fd;
+ struct bpf_map *map;
+ struct fd f;
+ void *key;
+ int err;
+
+ if (CHECK_ATTR(BPF_MAP_DELETE_ELEM))
+ return -EINVAL;
+
+ f = fdget(ufd);
+ map = __bpf_map_get(f);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ err = -ENOMEM;
+ key = kmalloc(map->key_size, GFP_USER);
+ if (!key)
+ goto err_put;
+
+ err = -EFAULT;
+ if (copy_from_user(key, ukey, map->key_size) != 0)
+ goto free_key;
+
+ rcu_read_lock();
+ err = map->ops->map_delete_elem(map, key);
+ rcu_read_unlock();
+
+free_key:
+ kfree(key);
+err_put:
+ fdput(f);
+ return err;
+}
+
+/* last field in 'union bpf_attr' used by this command */
+#define BPF_MAP_GET_NEXT_KEY_LAST_FIELD next_key
+
+static int map_get_next_key(union bpf_attr *attr)
+{
+ void __user *ukey = u64_to_ptr(attr->key);
+ void __user *unext_key = u64_to_ptr(attr->next_key);
+ int ufd = attr->map_fd;
+ struct bpf_map *map;
+ void *key, *next_key;
+ struct fd f;
+ int err;
+
+ if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY))
+ return -EINVAL;
+
+ f = fdget(ufd);
+ map = __bpf_map_get(f);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ if (ukey) {
+ err = -ENOMEM;
+ key = kmalloc(map->key_size, GFP_USER);
+ if (!key)
+ goto err_put;
+
+ err = -EFAULT;
+ if (copy_from_user(key, ukey, map->key_size) != 0)
+ goto free_key;
+ } else {
+ key = NULL;
+ }
+
+ err = -ENOMEM;
+ next_key = kmalloc(map->key_size, GFP_USER);
+ if (!next_key)
+ goto free_key;
+
+ rcu_read_lock();
+ err = map->ops->map_get_next_key(map, key, next_key);
+ rcu_read_unlock();
+ if (err)
+ goto free_next_key;
+
+ err = -EFAULT;
+ if (copy_to_user(unext_key, next_key, map->key_size) != 0)
+ goto free_next_key;
+
+ err = 0;
+
+free_next_key:
+ kfree(next_key);
+free_key:
+ kfree(key);
+err_put:
+ fdput(f);
+ return err;
+}
+
+static LIST_HEAD(bpf_prog_types);
+
+static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
+{
+ struct bpf_prog_type_list *tl;
+
+ list_for_each_entry(tl, &bpf_prog_types, list_node) {
+ if (tl->type == type) {
+ prog->aux->ops = tl->ops;
+ prog->type = type;
+ return 0;
+ }
+ }
+
+ return -EINVAL;
+}
+
+void bpf_register_prog_type(struct bpf_prog_type_list *tl)
+{
+ list_add(&tl->list_node, &bpf_prog_types);
+}
+
+/* drop refcnt on maps used by eBPF program and free auxilary data */
+static void free_used_maps(struct bpf_prog_aux *aux)
+{
+ int i;
+
+ for (i = 0; i < aux->used_map_cnt; i++)
+ bpf_map_put(aux->used_maps[i]);
+
+ kfree(aux->used_maps);
+}
+
+static int bpf_prog_charge_memlock(struct bpf_prog *prog)
+{
+ struct user_struct *user = get_current_user();
+ unsigned long memlock_limit;
+
+ memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+
+ atomic_long_add(prog->pages, &user->locked_vm);
+ if (atomic_long_read(&user->locked_vm) > memlock_limit) {
+ atomic_long_sub(prog->pages, &user->locked_vm);
+ free_uid(user);
+ return -EPERM;
+ }
+ prog->aux->user = user;
+ return 0;
+}
+
+static void bpf_prog_uncharge_memlock(struct bpf_prog *prog)
+{
+ struct user_struct *user = prog->aux->user;
+
+ atomic_long_sub(prog->pages, &user->locked_vm);
+ free_uid(user);
+}
+
+static void __bpf_prog_put_rcu(struct rcu_head *rcu)
+{
+ struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);
+
+ free_used_maps(aux);
+ bpf_prog_uncharge_memlock(aux->prog);
+ bpf_prog_free(aux->prog);
+}
+
+void bpf_prog_put(struct bpf_prog *prog)
+{
+ if (atomic_dec_and_test(&prog->aux->refcnt))
+ call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
+}
+EXPORT_SYMBOL_GPL(bpf_prog_put);
+
+static int bpf_prog_release(struct inode *inode, struct file *filp)
+{
+ struct bpf_prog *prog = filp->private_data;
+
+ bpf_prog_put(prog);
+ return 0;
+}
+
+static const struct file_operations bpf_prog_fops = {
+ .release = bpf_prog_release,
+};
+
+int bpf_prog_new_fd(struct bpf_prog *prog)
+{
+ return anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog,
+ O_RDWR | O_CLOEXEC);
+}
+
+static struct bpf_prog *__bpf_prog_get(struct fd f)
+{
+ if (!f.file)
+ return ERR_PTR(-EBADF);
+ if (f.file->f_op != &bpf_prog_fops) {
+ fdput(f);
+ return ERR_PTR(-EINVAL);
+ }
+
+ return f.file->private_data;
+}
+
+struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog)
+{
+ if (atomic_inc_return(&prog->aux->refcnt) > BPF_MAX_REFCNT) {
+ atomic_dec(&prog->aux->refcnt);
+ return ERR_PTR(-EBUSY);
+ }
+ return prog;
+}
+
+/* called by sockets/tracing/seccomp before attaching program to an event
+ * pairs with bpf_prog_put()
+ */
+struct bpf_prog *bpf_prog_get(u32 ufd)
+{
+ struct fd f = fdget(ufd);
+ struct bpf_prog *prog;
+
+ prog = __bpf_prog_get(f);
+ if (IS_ERR(prog))
+ return prog;
+
+ prog = bpf_prog_inc(prog);
+ fdput(f);
+
+ return prog;
+}
+EXPORT_SYMBOL_GPL(bpf_prog_get);
+
+/* last field in 'union bpf_attr' used by this command */
+#define BPF_PROG_LOAD_LAST_FIELD kern_version
+
+static int bpf_prog_load(union bpf_attr *attr)
+{
+ enum bpf_prog_type type = attr->prog_type;
+ struct bpf_prog *prog;
+ int err;
+ char license[128];
+ bool is_gpl;
+
+ if (CHECK_ATTR(BPF_PROG_LOAD))
+ return -EINVAL;
+
+ /* copy eBPF program license from user space */
+ if (strncpy_from_user(license, u64_to_ptr(attr->license),
+ sizeof(license) - 1) < 0)
+ return -EFAULT;
+ license[sizeof(license) - 1] = 0;
+
+ /* eBPF programs must be GPL compatible to use GPL-ed functions */
+ is_gpl = license_is_gpl_compatible(license);
+
+ if (attr->insn_cnt >= BPF_MAXINSNS)
+ return -EINVAL;
+
+ if (type == BPF_PROG_TYPE_KPROBE &&
+ attr->kern_version != LINUX_VERSION_CODE)
+ return -EINVAL;
+
+ if (type != BPF_PROG_TYPE_SOCKET_FILTER && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ /* plain bpf_prog allocation */
+ prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
+ if (!prog)
+ return -ENOMEM;
+
+ err = bpf_prog_charge_memlock(prog);
+ if (err)
+ goto free_prog_nouncharge;
+
+ prog->len = attr->insn_cnt;
+
+ err = -EFAULT;
+ if (copy_from_user(prog->insns, u64_to_ptr(attr->insns),
+ prog->len * sizeof(struct bpf_insn)) != 0)
+ goto free_prog;
+
+ prog->orig_prog = NULL;
+ prog->jited = 0;
+
+ atomic_set(&prog->aux->refcnt, 1);
+ prog->gpl_compatible = is_gpl ? 1 : 0;
+
+ /* find program type: socket_filter vs tracing_filter */
+ err = find_prog_type(type, prog);
+ if (err < 0)
+ goto free_prog;
+
+ /* run eBPF verifier */
+ err = bpf_check(&prog, attr);
+ if (err < 0)
+ goto free_used_maps;
+
+ /* eBPF program is ready to be JITed */
+ err = bpf_prog_select_runtime(prog);
+ if (err < 0)
+ goto free_used_maps;
+
+ err = bpf_prog_new_fd(prog);
+ if (err < 0)
+ /* failed to allocate fd */
+ goto free_used_maps;
+
+ return err;
+
+free_used_maps:
+ free_used_maps(prog->aux);
+free_prog:
+ bpf_prog_uncharge_memlock(prog);
+free_prog_nouncharge:
+ bpf_prog_free(prog);
+ return err;
+}
+
+#define BPF_OBJ_LAST_FIELD bpf_fd
+
+static int bpf_obj_pin(const union bpf_attr *attr)
+{
+ if (CHECK_ATTR(BPF_OBJ))
+ return -EINVAL;
+
+ return bpf_obj_pin_user(attr->bpf_fd, u64_to_ptr(attr->pathname));
+}
+
+static int bpf_obj_get(const union bpf_attr *attr)
+{
+ if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0)
+ return -EINVAL;
+
+ return bpf_obj_get_user(u64_to_ptr(attr->pathname));
+}
+
+SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
+{
+ union bpf_attr attr;
+ int err;
+
+ if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (!access_ok(VERIFY_READ, uattr, 1))
+ return -EFAULT;
+
+ if (size > PAGE_SIZE) /* silly large */
+ return -E2BIG;
+
+ /* If we're handed a bigger struct than we know of,
+ * ensure all the unknown bits are 0 - i.e. new
+ * user-space does not rely on any kernel feature
+ * extensions we dont know about yet.
+ */
+ if (size > sizeof(attr)) {
+ unsigned char __user *addr;
+ unsigned char __user *end;
+ unsigned char val;
+
+ addr = (void __user *)uattr + sizeof(attr);
+ end = (void __user *)uattr + size;
+
+ for (; addr < end; addr++) {
+ err = get_user(val, addr);
+ if (err)
+ return err;
+ if (val)
+ return -E2BIG;
+ }
+ size = sizeof(attr);
+ }
+
+ /* copy attributes from user space, may be less than sizeof(bpf_attr) */
+ memset(&attr, 0, sizeof(attr));
+ if (copy_from_user(&attr, uattr, size) != 0)
+ return -EFAULT;
+
+ switch (cmd) {
+ case BPF_MAP_CREATE:
+ err = map_create(&attr);
+ break;
+ case BPF_MAP_LOOKUP_ELEM:
+ err = map_lookup_elem(&attr);
+ break;
+ case BPF_MAP_UPDATE_ELEM:
+ err = map_update_elem(&attr);
+ break;
+ case BPF_MAP_DELETE_ELEM:
+ err = map_delete_elem(&attr);
+ break;
+ case BPF_MAP_GET_NEXT_KEY:
+ err = map_get_next_key(&attr);
+ break;
+ case BPF_PROG_LOAD:
+ err = bpf_prog_load(&attr);
+ break;
+ case BPF_OBJ_PIN:
+ err = bpf_obj_pin(&attr);
+ break;
+ case BPF_OBJ_GET:
+ err = bpf_obj_get(&attr);
+ break;
+ default:
+ err = -EINVAL;
+ break;
+ }
+
+ return err;
+}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
new file mode 100644
index 000000000000..c43ca9857479
--- /dev/null
+++ b/kernel/bpf/verifier.c
@@ -0,0 +1,2556 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <net/netlink.h>
+#include <linux/file.h>
+#include <linux/vmalloc.h>
+
+/* bpf_check() is a static code analyzer that walks eBPF program
+ * instruction by instruction and updates register/stack state.
+ * All paths of conditional branches are analyzed until 'bpf_exit' insn.
+ *
+ * The first pass is depth-first-search to check that the program is a DAG.
+ * It rejects the following programs:
+ * - larger than BPF_MAXINSNS insns
+ * - if loop is present (detected via back-edge)
+ * - unreachable insns exist (shouldn't be a forest. program = one function)
+ * - out of bounds or malformed jumps
+ * The second pass is all possible path descent from the 1st insn.
+ * Since it's analyzing all pathes through the program, the length of the
+ * analysis is limited to 32k insn, which may be hit even if total number of
+ * insn is less then 4K, but there are too many branches that change stack/regs.
+ * Number of 'branches to be analyzed' is limited to 1k
+ *
+ * On entry to each instruction, each register has a type, and the instruction
+ * changes the types of the registers depending on instruction semantics.
+ * If instruction is BPF_MOV64_REG(BPF_REG_1, BPF_REG_5), then type of R5 is
+ * copied to R1.
+ *
+ * All registers are 64-bit.
+ * R0 - return register
+ * R1-R5 argument passing registers
+ * R6-R9 callee saved registers
+ * R10 - frame pointer read-only
+ *
+ * At the start of BPF program the register R1 contains a pointer to bpf_context
+ * and has type PTR_TO_CTX.
+ *
+ * Verifier tracks arithmetic operations on pointers in case:
+ * BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+ * BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -20),
+ * 1st insn copies R10 (which has FRAME_PTR) type into R1
+ * and 2nd arithmetic instruction is pattern matched to recognize
+ * that it wants to construct a pointer to some element within stack.
+ * So after 2nd insn, the register R1 has type PTR_TO_STACK
+ * (and -20 constant is saved for further stack bounds checking).
+ * Meaning that this reg is a pointer to stack plus known immediate constant.
+ *
+ * Most of the time the registers have UNKNOWN_VALUE type, which
+ * means the register has some value, but it's not a valid pointer.
+ * (like pointer plus pointer becomes UNKNOWN_VALUE type)
+ *
+ * When verifier sees load or store instructions the type of base register
+ * can be: PTR_TO_MAP_VALUE, PTR_TO_CTX, FRAME_PTR. These are three pointer
+ * types recognized by check_mem_access() function.
+ *
+ * PTR_TO_MAP_VALUE means that this register is pointing to 'map element value'
+ * and the range of [ptr, ptr + map's value_size) is accessible.
+ *
+ * registers used to pass values to function calls are checked against
+ * function argument constraints.
+ *
+ * ARG_PTR_TO_MAP_KEY is one of such argument constraints.
+ * It means that the register type passed to this function must be
+ * PTR_TO_STACK and it will be used inside the function as
+ * 'pointer to map element key'
+ *
+ * For example the argument constraints for bpf_map_lookup_elem():
+ * .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
+ * .arg1_type = ARG_CONST_MAP_PTR,
+ * .arg2_type = ARG_PTR_TO_MAP_KEY,
+ *
+ * ret_type says that this function returns 'pointer to map elem value or null'
+ * function expects 1st argument to be a const pointer to 'struct bpf_map' and
+ * 2nd argument should be a pointer to stack, which will be used inside
+ * the helper function as a pointer to map element key.
+ *
+ * On the kernel side the helper function looks like:
+ * u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+ * {
+ * struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
+ * void *key = (void *) (unsigned long) r2;
+ * void *value;
+ *
+ * here kernel can access 'key' and 'map' pointers safely, knowing that
+ * [key, key + map->key_size) bytes are valid and were initialized on
+ * the stack of eBPF program.
+ * }
+ *
+ * Corresponding eBPF program may look like:
+ * BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), // after this insn R2 type is FRAME_PTR
+ * BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), // after this insn R2 type is PTR_TO_STACK
+ * BPF_LD_MAP_FD(BPF_REG_1, map_fd), // after this insn R1 type is CONST_PTR_TO_MAP
+ * BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ * here verifier looks at prototype of map_lookup_elem() and sees:
+ * .arg1_type == ARG_CONST_MAP_PTR and R1->type == CONST_PTR_TO_MAP, which is ok,
+ * Now verifier knows that this map has key of R1->map_ptr->key_size bytes
+ *
+ * Then .arg2_type == ARG_PTR_TO_MAP_KEY and R2->type == PTR_TO_STACK, ok so far,
+ * Now verifier checks that [R2, R2 + map's key_size) are within stack limits
+ * and were initialized prior to this call.
+ * If it's ok, then verifier allows this BPF_CALL insn and looks at
+ * .ret_type which is RET_PTR_TO_MAP_VALUE_OR_NULL, so it sets
+ * R0->type = PTR_TO_MAP_VALUE_OR_NULL which means bpf_map_lookup_elem() function
+ * returns ether pointer to map value or NULL.
+ *
+ * When type PTR_TO_MAP_VALUE_OR_NULL passes through 'if (reg != 0) goto +off'
+ * insn, the register holding that pointer in the true branch changes state to
+ * PTR_TO_MAP_VALUE and the same register changes state to CONST_IMM in the false
+ * branch. See check_cond_jmp_op().
+ *
+ * After the call R0 is set to return type of the function and registers R1-R5
+ * are set to NOT_INIT to indicate that they are no longer readable.
+ */
+
+/* types of values stored in eBPF registers */
+enum bpf_reg_type {
+ NOT_INIT = 0, /* nothing was written into register */
+ UNKNOWN_VALUE, /* reg doesn't contain a valid pointer */
+ PTR_TO_CTX, /* reg points to bpf_context */
+ CONST_PTR_TO_MAP, /* reg points to struct bpf_map */
+ PTR_TO_MAP_VALUE, /* reg points to map element value */
+ PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */
+ FRAME_PTR, /* reg == frame_pointer */
+ PTR_TO_STACK, /* reg == frame_pointer + imm */
+ CONST_IMM, /* constant integer value */
+};
+
+struct reg_state {
+ enum bpf_reg_type type;
+ union {
+ /* valid when type == CONST_IMM | PTR_TO_STACK */
+ int imm;
+
+ /* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE |
+ * PTR_TO_MAP_VALUE_OR_NULL
+ */
+ struct bpf_map *map_ptr;
+ };
+};
+
+enum bpf_stack_slot_type {
+ STACK_INVALID, /* nothing was stored in this stack slot */
+ STACK_SPILL, /* register spilled into stack */
+ STACK_MISC /* BPF program wrote some data into this slot */
+};
+
+#define BPF_REG_SIZE 8 /* size of eBPF register in bytes */
+
+/* state of the program:
+ * type of all registers and stack info
+ */
+struct verifier_state {
+ struct reg_state regs[MAX_BPF_REG];
+ u8 stack_slot_type[MAX_BPF_STACK];
+ struct reg_state spilled_regs[MAX_BPF_STACK / BPF_REG_SIZE];
+};
+
+/* linked list of verifier states used to prune search */
+struct verifier_state_list {
+ struct verifier_state state;
+ struct verifier_state_list *next;
+};
+
+/* verifier_state + insn_idx are pushed to stack when branch is encountered */
+struct verifier_stack_elem {
+ /* verifer state is 'st'
+ * before processing instruction 'insn_idx'
+ * and after processing instruction 'prev_insn_idx'
+ */
+ struct verifier_state st;
+ int insn_idx;
+ int prev_insn_idx;
+ struct verifier_stack_elem *next;
+};
+
+struct bpf_insn_aux_data {
+ union {
+ enum bpf_reg_type ptr_type; /* pointer type for load/store insns */
+ struct bpf_map *map_ptr; /* pointer for call insn into lookup_elem */
+ };
+ int sanitize_stack_off; /* stack slot to be cleared */
+ bool seen; /* this insn was processed by the verifier */
+};
+
+#define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */
+
+/* single container for all structs
+ * one verifier_env per bpf_check() call
+ */
+struct verifier_env {
+ struct bpf_prog *prog; /* eBPF program being verified */
+ struct verifier_stack_elem *head; /* stack of verifier states to be processed */
+ int stack_size; /* number of states to be processed */
+ struct verifier_state cur_state; /* current verifier state */
+ struct verifier_state_list **explored_states; /* search pruning optimization */
+ struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */
+ u32 used_map_cnt; /* number of used maps */
+ bool allow_ptr_leaks;
+ struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */
+};
+
+/* verbose verifier prints what it's seeing
+ * bpf_check() is called under lock, so no race to access these global vars
+ */
+static u32 log_level, log_size, log_len;
+static char *log_buf;
+
+static DEFINE_MUTEX(bpf_verifier_lock);
+
+/* log_level controls verbosity level of eBPF verifier.
+ * verbose() is used to dump the verification trace to the log, so the user
+ * can figure out what's wrong with the program
+ */
+static __printf(1, 2) void verbose(const char *fmt, ...)
+{
+ va_list args;
+
+ if (log_level == 0 || log_len >= log_size - 1)
+ return;
+
+ va_start(args, fmt);
+ log_len += vscnprintf(log_buf + log_len, log_size - log_len, fmt, args);
+ va_end(args);
+}
+
+/* string representation of 'enum bpf_reg_type' */
+static const char * const reg_type_str[] = {
+ [NOT_INIT] = "?",
+ [UNKNOWN_VALUE] = "inv",
+ [PTR_TO_CTX] = "ctx",
+ [CONST_PTR_TO_MAP] = "map_ptr",
+ [PTR_TO_MAP_VALUE] = "map_value",
+ [PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null",
+ [FRAME_PTR] = "fp",
+ [PTR_TO_STACK] = "fp",
+ [CONST_IMM] = "imm",
+};
+
+static void print_verifier_state(struct verifier_env *env)
+{
+ enum bpf_reg_type t;
+ int i;
+
+ for (i = 0; i < MAX_BPF_REG; i++) {
+ t = env->cur_state.regs[i].type;
+ if (t == NOT_INIT)
+ continue;
+ verbose(" R%d=%s", i, reg_type_str[t]);
+ if (t == CONST_IMM || t == PTR_TO_STACK)
+ verbose("%d", env->cur_state.regs[i].imm);
+ else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE ||
+ t == PTR_TO_MAP_VALUE_OR_NULL)
+ verbose("(ks=%d,vs=%d)",
+ env->cur_state.regs[i].map_ptr->key_size,
+ env->cur_state.regs[i].map_ptr->value_size);
+ }
+ for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
+ if (env->cur_state.stack_slot_type[i] == STACK_SPILL)
+ verbose(" fp%d=%s", -MAX_BPF_STACK + i,
+ reg_type_str[env->cur_state.spilled_regs[i / BPF_REG_SIZE].type]);
+ }
+ verbose("\n");
+}
+
+static const char *const bpf_class_string[] = {
+ [BPF_LD] = "ld",
+ [BPF_LDX] = "ldx",
+ [BPF_ST] = "st",
+ [BPF_STX] = "stx",
+ [BPF_ALU] = "alu",
+ [BPF_JMP] = "jmp",
+ [BPF_RET] = "BUG",
+ [BPF_ALU64] = "alu64",
+};
+
+static const char *const bpf_alu_string[16] = {
+ [BPF_ADD >> 4] = "+=",
+ [BPF_SUB >> 4] = "-=",
+ [BPF_MUL >> 4] = "*=",
+ [BPF_DIV >> 4] = "/=",
+ [BPF_OR >> 4] = "|=",
+ [BPF_AND >> 4] = "&=",
+ [BPF_LSH >> 4] = "<<=",
+ [BPF_RSH >> 4] = ">>=",
+ [BPF_NEG >> 4] = "neg",
+ [BPF_MOD >> 4] = "%=",
+ [BPF_XOR >> 4] = "^=",
+ [BPF_MOV >> 4] = "=",
+ [BPF_ARSH >> 4] = "s>>=",
+ [BPF_END >> 4] = "endian",
+};
+
+static const char *const bpf_ldst_string[] = {
+ [BPF_W >> 3] = "u32",
+ [BPF_H >> 3] = "u16",
+ [BPF_B >> 3] = "u8",
+ [BPF_DW >> 3] = "u64",
+};
+
+static const char *const bpf_jmp_string[16] = {
+ [BPF_JA >> 4] = "jmp",
+ [BPF_JEQ >> 4] = "==",
+ [BPF_JGT >> 4] = ">",
+ [BPF_JGE >> 4] = ">=",
+ [BPF_JSET >> 4] = "&",
+ [BPF_JNE >> 4] = "!=",
+ [BPF_JSGT >> 4] = "s>",
+ [BPF_JSGE >> 4] = "s>=",
+ [BPF_CALL >> 4] = "call",
+ [BPF_EXIT >> 4] = "exit",
+};
+
+static void print_bpf_insn(const struct verifier_env *env,
+ const struct bpf_insn *insn)
+{
+ u8 class = BPF_CLASS(insn->code);
+
+ if (class == BPF_ALU || class == BPF_ALU64) {
+ if (BPF_SRC(insn->code) == BPF_X)
+ verbose("(%02x) %sr%d %s %sr%d\n",
+ insn->code, class == BPF_ALU ? "(u32) " : "",
+ insn->dst_reg,
+ bpf_alu_string[BPF_OP(insn->code) >> 4],
+ class == BPF_ALU ? "(u32) " : "",
+ insn->src_reg);
+ else
+ verbose("(%02x) %sr%d %s %s%d\n",
+ insn->code, class == BPF_ALU ? "(u32) " : "",
+ insn->dst_reg,
+ bpf_alu_string[BPF_OP(insn->code) >> 4],
+ class == BPF_ALU ? "(u32) " : "",
+ insn->imm);
+ } else if (class == BPF_STX) {
+ if (BPF_MODE(insn->code) == BPF_MEM)
+ verbose("(%02x) *(%s *)(r%d %+d) = r%d\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->dst_reg,
+ insn->off, insn->src_reg);
+ else if (BPF_MODE(insn->code) == BPF_XADD)
+ verbose("(%02x) lock *(%s *)(r%d %+d) += r%d\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->dst_reg, insn->off,
+ insn->src_reg);
+ else
+ verbose("BUG_%02x\n", insn->code);
+ } else if (class == BPF_ST) {
+ if (BPF_MODE(insn->code) != BPF_MEM) {
+ verbose("BUG_st_%02x\n", insn->code);
+ return;
+ }
+ verbose("(%02x) *(%s *)(r%d %+d) = %d\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->dst_reg,
+ insn->off, insn->imm);
+ } else if (class == BPF_LDX) {
+ if (BPF_MODE(insn->code) != BPF_MEM) {
+ verbose("BUG_ldx_%02x\n", insn->code);
+ return;
+ }
+ verbose("(%02x) r%d = *(%s *)(r%d %+d)\n",
+ insn->code, insn->dst_reg,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->src_reg, insn->off);
+ } else if (class == BPF_LD) {
+ if (BPF_MODE(insn->code) == BPF_ABS) {
+ verbose("(%02x) r0 = *(%s *)skb[%d]\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->imm);
+ } else if (BPF_MODE(insn->code) == BPF_IND) {
+ verbose("(%02x) r0 = *(%s *)skb[r%d + %d]\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->src_reg, insn->imm);
+ } else if (BPF_MODE(insn->code) == BPF_IMM &&
+ BPF_SIZE(insn->code) == BPF_DW) {
+ /* At this point, we already made sure that the second
+ * part of the ldimm64 insn is accessible.
+ */
+ u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm;
+ bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD;
+
+ if (map_ptr && !env->allow_ptr_leaks)
+ imm = 0;
+
+ verbose("(%02x) r%d = 0x%llx\n", insn->code,
+ insn->dst_reg, (unsigned long long)imm);
+ } else {
+ verbose("BUG_ld_%02x\n", insn->code);
+ return;
+ }
+ } else if (class == BPF_JMP) {
+ u8 opcode = BPF_OP(insn->code);
+
+ if (opcode == BPF_CALL) {
+ verbose("(%02x) call %d\n", insn->code, insn->imm);
+ } else if (insn->code == (BPF_JMP | BPF_JA)) {
+ verbose("(%02x) goto pc%+d\n",
+ insn->code, insn->off);
+ } else if (insn->code == (BPF_JMP | BPF_EXIT)) {
+ verbose("(%02x) exit\n", insn->code);
+ } else if (BPF_SRC(insn->code) == BPF_X) {
+ verbose("(%02x) if r%d %s r%d goto pc%+d\n",
+ insn->code, insn->dst_reg,
+ bpf_jmp_string[BPF_OP(insn->code) >> 4],
+ insn->src_reg, insn->off);
+ } else {
+ verbose("(%02x) if r%d %s 0x%x goto pc%+d\n",
+ insn->code, insn->dst_reg,
+ bpf_jmp_string[BPF_OP(insn->code) >> 4],
+ insn->imm, insn->off);
+ }
+ } else {
+ verbose("(%02x) %s\n", insn->code, bpf_class_string[class]);
+ }
+}
+
+static int pop_stack(struct verifier_env *env, int *prev_insn_idx)
+{
+ struct verifier_stack_elem *elem;
+ int insn_idx;
+
+ if (env->head == NULL)
+ return -1;
+
+ memcpy(&env->cur_state, &env->head->st, sizeof(env->cur_state));
+ insn_idx = env->head->insn_idx;
+ if (prev_insn_idx)
+ *prev_insn_idx = env->head->prev_insn_idx;
+ elem = env->head->next;
+ kfree(env->head);
+ env->head = elem;
+ env->stack_size--;
+ return insn_idx;
+}
+
+static struct verifier_state *push_stack(struct verifier_env *env, int insn_idx,
+ int prev_insn_idx)
+{
+ struct verifier_stack_elem *elem;
+
+ elem = kmalloc(sizeof(struct verifier_stack_elem), GFP_KERNEL);
+ if (!elem)
+ goto err;
+
+ memcpy(&elem->st, &env->cur_state, sizeof(env->cur_state));
+ elem->insn_idx = insn_idx;
+ elem->prev_insn_idx = prev_insn_idx;
+ elem->next = env->head;
+ env->head = elem;
+ env->stack_size++;
+ if (env->stack_size > 1024) {
+ verbose("BPF program is too complex\n");
+ goto err;
+ }
+ return &elem->st;
+err:
+ /* pop all elements and return */
+ while (pop_stack(env, NULL) >= 0);
+ return NULL;
+}
+
+#define CALLER_SAVED_REGS 6
+static const int caller_saved[CALLER_SAVED_REGS] = {
+ BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5
+};
+
+static void init_reg_state(struct reg_state *regs)
+{
+ int i;
+
+ for (i = 0; i < MAX_BPF_REG; i++) {
+ regs[i].type = NOT_INIT;
+ regs[i].imm = 0;
+ regs[i].map_ptr = NULL;
+ }
+
+ /* frame pointer */
+ regs[BPF_REG_FP].type = FRAME_PTR;
+
+ /* 1st arg to a function */
+ regs[BPF_REG_1].type = PTR_TO_CTX;
+}
+
+static void mark_reg_unknown_value(struct reg_state *regs, u32 regno)
+{
+ BUG_ON(regno >= MAX_BPF_REG);
+ regs[regno].type = UNKNOWN_VALUE;
+ regs[regno].imm = 0;
+ regs[regno].map_ptr = NULL;
+}
+
+enum reg_arg_type {
+ SRC_OP, /* register is used as source operand */
+ DST_OP, /* register is used as destination operand */
+ DST_OP_NO_MARK /* same as above, check only, don't mark */
+};
+
+static int check_reg_arg(struct reg_state *regs, u32 regno,
+ enum reg_arg_type t)
+{
+ if (regno >= MAX_BPF_REG) {
+ verbose("R%d is invalid\n", regno);
+ return -EINVAL;
+ }
+
+ if (t == SRC_OP) {
+ /* check whether register used as source operand can be read */
+ if (regs[regno].type == NOT_INIT) {
+ verbose("R%d !read_ok\n", regno);
+ return -EACCES;
+ }
+ } else {
+ /* check whether register used as dest operand can be written to */
+ if (regno == BPF_REG_FP) {
+ verbose("frame pointer is read only\n");
+ return -EACCES;
+ }
+ if (t == DST_OP)
+ mark_reg_unknown_value(regs, regno);
+ }
+ return 0;
+}
+
+static int bpf_size_to_bytes(int bpf_size)
+{
+ if (bpf_size == BPF_W)
+ return 4;
+ else if (bpf_size == BPF_H)
+ return 2;
+ else if (bpf_size == BPF_B)
+ return 1;
+ else if (bpf_size == BPF_DW)
+ return 8;
+ else
+ return -EINVAL;
+}
+
+static bool is_spillable_regtype(enum bpf_reg_type type)
+{
+ switch (type) {
+ case PTR_TO_MAP_VALUE:
+ case PTR_TO_MAP_VALUE_OR_NULL:
+ case PTR_TO_STACK:
+ case PTR_TO_CTX:
+ case FRAME_PTR:
+ case CONST_PTR_TO_MAP:
+ return true;
+ default:
+ return false;
+ }
+}
+
+/* check_stack_read/write functions track spill/fill of registers,
+ * stack boundary and alignment are checked in check_mem_access()
+ */
+static int check_stack_write(struct verifier_env *env,
+ struct verifier_state *state, int off,
+ int size, int value_regno, int insn_idx)
+{
+ int i, spi = (MAX_BPF_STACK + off) / BPF_REG_SIZE;
+ /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0,
+ * so it's aligned access and [off, off + size) are within stack limits
+ */
+
+ if (value_regno >= 0 &&
+ is_spillable_regtype(state->regs[value_regno].type)) {
+
+ /* register containing pointer is being spilled into stack */
+ if (size != BPF_REG_SIZE) {
+ verbose("invalid size of register spill\n");
+ return -EACCES;
+ }
+
+ /* save register state */
+ state->spilled_regs[spi] = state->regs[value_regno];
+
+ for (i = 0; i < BPF_REG_SIZE; i++) {
+ if (state->stack_slot_type[MAX_BPF_STACK + off + i] == STACK_MISC &&
+ !env->allow_ptr_leaks) {
+ int *poff = &env->insn_aux_data[insn_idx].sanitize_stack_off;
+ int soff = (-spi - 1) * BPF_REG_SIZE;
+
+ /* detected reuse of integer stack slot with a pointer
+ * which means either llvm is reusing stack slot or
+ * an attacker is trying to exploit CVE-2018-3639
+ * (speculative store bypass)
+ * Have to sanitize that slot with preemptive
+ * store of zero.
+ */
+ if (*poff && *poff != soff) {
+ /* disallow programs where single insn stores
+ * into two different stack slots, since verifier
+ * cannot sanitize them
+ */
+ verbose("insn %d cannot access two stack slots fp%d and fp%d",
+ insn_idx, *poff, soff);
+ return -EINVAL;
+ }
+ *poff = soff;
+ }
+ state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_SPILL;
+ }
+ } else {
+ /* regular write of data into stack */
+ state->spilled_regs[spi] = (struct reg_state) {};
+
+ for (i = 0; i < size; i++)
+ state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_MISC;
+ }
+ return 0;
+}
+
+static int check_stack_read(struct verifier_state *state, int off, int size,
+ int value_regno)
+{
+ u8 *slot_type;
+ int i;
+
+ slot_type = &state->stack_slot_type[MAX_BPF_STACK + off];
+
+ if (slot_type[0] == STACK_SPILL) {
+ if (size != BPF_REG_SIZE) {
+ verbose("invalid size of register spill\n");
+ return -EACCES;
+ }
+ for (i = 1; i < BPF_REG_SIZE; i++) {
+ if (slot_type[i] != STACK_SPILL) {
+ verbose("corrupted spill memory\n");
+ return -EACCES;
+ }
+ }
+
+ if (value_regno >= 0)
+ /* restore register state from stack */
+ state->regs[value_regno] =
+ state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE];
+ return 0;
+ } else {
+ for (i = 0; i < size; i++) {
+ if (slot_type[i] != STACK_MISC) {
+ verbose("invalid read from stack off %d+%d size %d\n",
+ off, i, size);
+ return -EACCES;
+ }
+ }
+ if (value_regno >= 0)
+ /* have read misc data from the stack */
+ mark_reg_unknown_value(state->regs, value_regno);
+ return 0;
+ }
+}
+
+/* check read/write into map element returned by bpf_map_lookup_elem() */
+static int check_map_access(struct verifier_env *env, u32 regno, int off,
+ int size)
+{
+ struct bpf_map *map = env->cur_state.regs[regno].map_ptr;
+
+ if (off < 0 || off + size > map->value_size) {
+ verbose("invalid access to map value, value_size=%d off=%d size=%d\n",
+ map->value_size, off, size);
+ return -EACCES;
+ }
+ return 0;
+}
+
+/* check access to 'struct bpf_context' fields */
+static int check_ctx_access(struct verifier_env *env, int off, int size,
+ enum bpf_access_type t)
+{
+ if (env->prog->aux->ops->is_valid_access &&
+ env->prog->aux->ops->is_valid_access(off, size, t))
+ return 0;
+
+ verbose("invalid bpf_context access off=%d size=%d\n", off, size);
+ return -EACCES;
+}
+
+static bool is_pointer_value(struct verifier_env *env, int regno)
+{
+ if (env->allow_ptr_leaks)
+ return false;
+
+ switch (env->cur_state.regs[regno].type) {
+ case UNKNOWN_VALUE:
+ case CONST_IMM:
+ return false;
+ default:
+ return true;
+ }
+}
+
+static bool is_ctx_reg(struct verifier_env *env, int regno)
+{
+ const struct reg_state *reg = &env->cur_state.regs[regno];
+
+ return reg->type == PTR_TO_CTX;
+}
+
+/* check whether memory at (regno + off) is accessible for t = (read | write)
+ * if t==write, value_regno is a register which value is stored into memory
+ * if t==read, value_regno is a register which will receive the value from memory
+ * if t==write && value_regno==-1, some unknown value is stored into memory
+ * if t==read && value_regno==-1, don't care what we read from memory
+ */
+static int check_mem_access(struct verifier_env *env, int insn_idx, u32 regno, int off,
+ int bpf_size, enum bpf_access_type t,
+ int value_regno)
+{
+ struct verifier_state *state = &env->cur_state;
+ int size, err = 0;
+
+ if (state->regs[regno].type == PTR_TO_STACK)
+ off += state->regs[regno].imm;
+
+ size = bpf_size_to_bytes(bpf_size);
+ if (size < 0)
+ return size;
+
+ if (off % size != 0) {
+ verbose("misaligned access off %d size %d\n", off, size);
+ return -EACCES;
+ }
+
+ if (state->regs[regno].type == PTR_TO_MAP_VALUE) {
+ if (t == BPF_WRITE && value_regno >= 0 &&
+ is_pointer_value(env, value_regno)) {
+ verbose("R%d leaks addr into map\n", value_regno);
+ return -EACCES;
+ }
+ err = check_map_access(env, regno, off, size);
+ if (!err && t == BPF_READ && value_regno >= 0)
+ mark_reg_unknown_value(state->regs, value_regno);
+
+ } else if (state->regs[regno].type == PTR_TO_CTX) {
+ if (t == BPF_WRITE && value_regno >= 0 &&
+ is_pointer_value(env, value_regno)) {
+ verbose("R%d leaks addr into ctx\n", value_regno);
+ return -EACCES;
+ }
+ err = check_ctx_access(env, off, size, t);
+ if (!err && t == BPF_READ && value_regno >= 0)
+ mark_reg_unknown_value(state->regs, value_regno);
+
+ } else if (state->regs[regno].type == FRAME_PTR ||
+ state->regs[regno].type == PTR_TO_STACK) {
+ if (off >= 0 || off < -MAX_BPF_STACK) {
+ verbose("invalid stack off=%d size=%d\n", off, size);
+ return -EACCES;
+ }
+ if (t == BPF_WRITE) {
+ if (!env->allow_ptr_leaks &&
+ state->stack_slot_type[MAX_BPF_STACK + off] == STACK_SPILL &&
+ size != BPF_REG_SIZE) {
+ verbose("attempt to corrupt spilled pointer on stack\n");
+ return -EACCES;
+ }
+ err = check_stack_write(env, state, off, size,
+ value_regno, insn_idx);
+ } else {
+ err = check_stack_read(state, off, size, value_regno);
+ }
+ } else {
+ verbose("R%d invalid mem access '%s'\n",
+ regno, reg_type_str[state->regs[regno].type]);
+ return -EACCES;
+ }
+ return err;
+}
+
+static int check_xadd(struct verifier_env *env, int insn_idx, struct bpf_insn *insn)
+{
+ struct reg_state *regs = env->cur_state.regs;
+ int err;
+
+ if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) ||
+ insn->imm != 0) {
+ verbose("BPF_XADD uses reserved fields\n");
+ return -EINVAL;
+ }
+
+ /* check src1 operand */
+ err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ if (err)
+ return err;
+
+ /* check src2 operand */
+ err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
+ if (err)
+ return err;
+
+ if (is_pointer_value(env, insn->src_reg)) {
+ verbose("R%d leaks addr into mem\n", insn->src_reg);
+ return -EACCES;
+ }
+
+ if (is_ctx_reg(env, insn->dst_reg)) {
+ verbose("BPF_XADD stores into R%d context is not allowed\n",
+ insn->dst_reg);
+ return -EACCES;
+ }
+
+ /* check whether atomic_add can read the memory */
+ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
+ BPF_SIZE(insn->code), BPF_READ, -1);
+ if (err)
+ return err;
+
+ /* check whether atomic_add can write into the same memory */
+ return check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
+ BPF_SIZE(insn->code), BPF_WRITE, -1);
+}
+
+/* when register 'regno' is passed into function that will read 'access_size'
+ * bytes from that pointer, make sure that it's within stack boundary
+ * and all elements of stack are initialized
+ */
+static int check_stack_boundary(struct verifier_env *env,
+ int regno, int access_size)
+{
+ struct verifier_state *state = &env->cur_state;
+ struct reg_state *regs = state->regs;
+ int off, i;
+
+ if (regs[regno].type != PTR_TO_STACK)
+ return -EACCES;
+
+ off = regs[regno].imm;
+ if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 ||
+ access_size <= 0) {
+ verbose("invalid stack type R%d off=%d access_size=%d\n",
+ regno, off, access_size);
+ return -EACCES;
+ }
+
+ for (i = 0; i < access_size; i++) {
+ if (state->stack_slot_type[MAX_BPF_STACK + off + i] != STACK_MISC) {
+ verbose("invalid indirect read from stack off %d+%d size %d\n",
+ off, i, access_size);
+ return -EACCES;
+ }
+ }
+ return 0;
+}
+
+static int check_func_arg(struct verifier_env *env, u32 regno,
+ enum bpf_arg_type arg_type, struct bpf_map **mapp)
+{
+ struct reg_state *reg = env->cur_state.regs + regno;
+ enum bpf_reg_type expected_type;
+ int err = 0;
+
+ if (arg_type == ARG_DONTCARE)
+ return 0;
+
+ if (reg->type == NOT_INIT) {
+ verbose("R%d !read_ok\n", regno);
+ return -EACCES;
+ }
+
+ if (arg_type == ARG_ANYTHING) {
+ if (is_pointer_value(env, regno)) {
+ verbose("R%d leaks addr into helper function\n", regno);
+ return -EACCES;
+ }
+ return 0;
+ }
+
+ if (arg_type == ARG_PTR_TO_STACK || arg_type == ARG_PTR_TO_MAP_KEY ||
+ arg_type == ARG_PTR_TO_MAP_VALUE) {
+ expected_type = PTR_TO_STACK;
+ } else if (arg_type == ARG_CONST_STACK_SIZE) {
+ expected_type = CONST_IMM;
+ } else if (arg_type == ARG_CONST_MAP_PTR) {
+ expected_type = CONST_PTR_TO_MAP;
+ } else if (arg_type == ARG_PTR_TO_CTX) {
+ expected_type = PTR_TO_CTX;
+ } else {
+ verbose("unsupported arg_type %d\n", arg_type);
+ return -EFAULT;
+ }
+
+ if (reg->type != expected_type) {
+ verbose("R%d type=%s expected=%s\n", regno,
+ reg_type_str[reg->type], reg_type_str[expected_type]);
+ return -EACCES;
+ }
+
+ if (arg_type == ARG_CONST_MAP_PTR) {
+ /* bpf_map_xxx(map_ptr) call: remember that map_ptr */
+ *mapp = reg->map_ptr;
+
+ } else if (arg_type == ARG_PTR_TO_MAP_KEY) {
+ /* bpf_map_xxx(..., map_ptr, ..., key) call:
+ * check that [key, key + map->key_size) are within
+ * stack limits and initialized
+ */
+ if (!*mapp) {
+ /* in function declaration map_ptr must come before
+ * map_key, so that it's verified and known before
+ * we have to check map_key here. Otherwise it means
+ * that kernel subsystem misconfigured verifier
+ */
+ verbose("invalid map_ptr to access map->key\n");
+ return -EACCES;
+ }
+ err = check_stack_boundary(env, regno, (*mapp)->key_size);
+
+ } else if (arg_type == ARG_PTR_TO_MAP_VALUE) {
+ /* bpf_map_xxx(..., map_ptr, ..., value) call:
+ * check [value, value + map->value_size) validity
+ */
+ if (!*mapp) {
+ /* kernel subsystem misconfigured verifier */
+ verbose("invalid map_ptr to access map->value\n");
+ return -EACCES;
+ }
+ err = check_stack_boundary(env, regno, (*mapp)->value_size);
+
+ } else if (arg_type == ARG_CONST_STACK_SIZE) {
+ /* bpf_xxx(..., buf, len) call will access 'len' bytes
+ * from stack pointer 'buf'. Check it
+ * note: regno == len, regno - 1 == buf
+ */
+ if (regno == 0) {
+ /* kernel subsystem misconfigured verifier */
+ verbose("ARG_CONST_STACK_SIZE cannot be first argument\n");
+ return -EACCES;
+ }
+ err = check_stack_boundary(env, regno - 1, reg->imm);
+ }
+
+ return err;
+}
+
+static int check_map_func_compatibility(struct bpf_map *map, int func_id)
+{
+ if (!map)
+ return 0;
+
+ /* We need a two way check, first is from map perspective ... */
+ switch (map->map_type) {
+ case BPF_MAP_TYPE_PROG_ARRAY:
+ if (func_id != BPF_FUNC_tail_call)
+ goto error;
+ break;
+ case BPF_MAP_TYPE_PERF_EVENT_ARRAY:
+ if (func_id != BPF_FUNC_perf_event_read &&
+ func_id != BPF_FUNC_perf_event_output)
+ goto error;
+ break;
+ default:
+ break;
+ }
+
+ /* ... and second from the function itself. */
+ switch (func_id) {
+ case BPF_FUNC_tail_call:
+ if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
+ goto error;
+ break;
+ case BPF_FUNC_perf_event_read:
+ case BPF_FUNC_perf_event_output:
+ if (map->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY)
+ goto error;
+ break;
+ default:
+ break;
+ }
+
+ return 0;
+error:
+ verbose("cannot pass map_type %d into func %d\n",
+ map->map_type, func_id);
+ return -EINVAL;
+}
+
+static int check_call(struct verifier_env *env, int func_id, int insn_idx)
+{
+ struct verifier_state *state = &env->cur_state;
+ const struct bpf_func_proto *fn = NULL;
+ struct reg_state *regs = state->regs;
+ struct bpf_map *map = NULL;
+ struct reg_state *reg;
+ int i, err;
+
+ /* find function prototype */
+ if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) {
+ verbose("invalid func %d\n", func_id);
+ return -EINVAL;
+ }
+
+ if (env->prog->aux->ops->get_func_proto)
+ fn = env->prog->aux->ops->get_func_proto(func_id);
+
+ if (!fn) {
+ verbose("unknown func %d\n", func_id);
+ return -EINVAL;
+ }
+
+ /* eBPF programs must be GPL compatible to use GPL-ed functions */
+ if (!env->prog->gpl_compatible && fn->gpl_only) {
+ verbose("cannot call GPL only function from proprietary program\n");
+ return -EINVAL;
+ }
+
+ /* check args */
+ err = check_func_arg(env, BPF_REG_1, fn->arg1_type, &map);
+ if (err)
+ return err;
+ err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &map);
+ if (err)
+ return err;
+ if (func_id == BPF_FUNC_tail_call) {
+ if (map == NULL) {
+ verbose("verifier bug\n");
+ return -EINVAL;
+ }
+ env->insn_aux_data[insn_idx].map_ptr = map;
+ }
+ err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &map);
+ if (err)
+ return err;
+ err = check_func_arg(env, BPF_REG_4, fn->arg4_type, &map);
+ if (err)
+ return err;
+ err = check_func_arg(env, BPF_REG_5, fn->arg5_type, &map);
+ if (err)
+ return err;
+
+ /* reset caller saved regs */
+ for (i = 0; i < CALLER_SAVED_REGS; i++) {
+ reg = regs + caller_saved[i];
+ reg->type = NOT_INIT;
+ reg->imm = 0;
+ }
+
+ /* update return register */
+ if (fn->ret_type == RET_INTEGER) {
+ regs[BPF_REG_0].type = UNKNOWN_VALUE;
+ } else if (fn->ret_type == RET_VOID) {
+ regs[BPF_REG_0].type = NOT_INIT;
+ } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) {
+ regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;
+ /* remember map_ptr, so that check_map_access()
+ * can check 'value_size' boundary of memory access
+ * to map element returned from bpf_map_lookup_elem()
+ */
+ if (map == NULL) {
+ verbose("kernel subsystem misconfigured verifier\n");
+ return -EINVAL;
+ }
+ regs[BPF_REG_0].map_ptr = map;
+ } else {
+ verbose("unknown return type %d of func %d\n",
+ fn->ret_type, func_id);
+ return -EINVAL;
+ }
+
+ err = check_map_func_compatibility(map, func_id);
+ if (err)
+ return err;
+
+ return 0;
+}
+
+/* check validity of 32-bit and 64-bit arithmetic operations */
+static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn)
+{
+ struct reg_state *regs = env->cur_state.regs;
+ u8 opcode = BPF_OP(insn->code);
+ int err;
+
+ if (opcode == BPF_END || opcode == BPF_NEG) {
+ if (opcode == BPF_NEG) {
+ if (BPF_SRC(insn->code) != 0 ||
+ insn->src_reg != BPF_REG_0 ||
+ insn->off != 0 || insn->imm != 0) {
+ verbose("BPF_NEG uses reserved fields\n");
+ return -EINVAL;
+ }
+ } else {
+ if (insn->src_reg != BPF_REG_0 || insn->off != 0 ||
+ (insn->imm != 16 && insn->imm != 32 && insn->imm != 64) ||
+ BPF_CLASS(insn->code) == BPF_ALU64) {
+ verbose("BPF_END uses reserved fields\n");
+ return -EINVAL;
+ }
+ }
+
+ /* check src operand */
+ err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
+ if (err)
+ return err;
+
+ if (is_pointer_value(env, insn->dst_reg)) {
+ verbose("R%d pointer arithmetic prohibited\n",
+ insn->dst_reg);
+ return -EACCES;
+ }
+
+ /* check dest operand */
+ err = check_reg_arg(regs, insn->dst_reg, DST_OP);
+ if (err)
+ return err;
+
+ } else if (opcode == BPF_MOV) {
+
+ if (BPF_SRC(insn->code) == BPF_X) {
+ if (insn->imm != 0 || insn->off != 0) {
+ verbose("BPF_MOV uses reserved fields\n");
+ return -EINVAL;
+ }
+
+ /* check src operand */
+ err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ if (err)
+ return err;
+ } else {
+ if (insn->src_reg != BPF_REG_0 || insn->off != 0) {
+ verbose("BPF_MOV uses reserved fields\n");
+ return -EINVAL;
+ }
+ }
+
+ /* check dest operand */
+ err = check_reg_arg(regs, insn->dst_reg, DST_OP);
+ if (err)
+ return err;
+
+ if (BPF_SRC(insn->code) == BPF_X) {
+ if (BPF_CLASS(insn->code) == BPF_ALU64) {
+ /* case: R1 = R2
+ * copy register state to dest reg
+ */
+ regs[insn->dst_reg] = regs[insn->src_reg];
+ } else {
+ if (is_pointer_value(env, insn->src_reg)) {
+ verbose("R%d partial copy of pointer\n",
+ insn->src_reg);
+ return -EACCES;
+ }
+ regs[insn->dst_reg].type = UNKNOWN_VALUE;
+ regs[insn->dst_reg].map_ptr = NULL;
+ }
+ } else if (BPF_CLASS(insn->code) == BPF_ALU64 ||
+ insn->imm >= 0) {
+ /* case: R = imm
+ * remember the value we stored into this reg
+ */
+ regs[insn->dst_reg].type = CONST_IMM;
+ regs[insn->dst_reg].imm = insn->imm;
+ }
+
+ } else if (opcode > BPF_END) {
+ verbose("invalid BPF_ALU opcode %x\n", opcode);
+ return -EINVAL;
+
+ } else { /* all other ALU ops: and, sub, xor, add, ... */
+
+ bool stack_relative = false;
+
+ if (BPF_SRC(insn->code) == BPF_X) {
+ if (insn->imm != 0 || insn->off != 0) {
+ verbose("BPF_ALU uses reserved fields\n");
+ return -EINVAL;
+ }
+ /* check src1 operand */
+ err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ if (err)
+ return err;
+ } else {
+ if (insn->src_reg != BPF_REG_0 || insn->off != 0) {
+ verbose("BPF_ALU uses reserved fields\n");
+ return -EINVAL;
+ }
+ }
+
+ /* check src2 operand */
+ err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
+ if (err)
+ return err;
+
+ if ((opcode == BPF_MOD || opcode == BPF_DIV) &&
+ BPF_SRC(insn->code) == BPF_K && insn->imm == 0) {
+ verbose("div by zero\n");
+ return -EINVAL;
+ }
+
+ if (opcode == BPF_ARSH && BPF_CLASS(insn->code) != BPF_ALU64) {
+ verbose("BPF_ARSH not supported for 32 bit ALU\n");
+ return -EINVAL;
+ }
+
+ if ((opcode == BPF_LSH || opcode == BPF_RSH ||
+ opcode == BPF_ARSH) && BPF_SRC(insn->code) == BPF_K) {
+ int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32;
+
+ if (insn->imm < 0 || insn->imm >= size) {
+ verbose("invalid shift %d\n", insn->imm);
+ return -EINVAL;
+ }
+ }
+
+ /* pattern match 'bpf_add Rx, imm' instruction */
+ if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 &&
+ regs[insn->dst_reg].type == FRAME_PTR &&
+ BPF_SRC(insn->code) == BPF_K) {
+ stack_relative = true;
+ } else if (is_pointer_value(env, insn->dst_reg)) {
+ verbose("R%d pointer arithmetic prohibited\n",
+ insn->dst_reg);
+ return -EACCES;
+ } else if (BPF_SRC(insn->code) == BPF_X &&
+ is_pointer_value(env, insn->src_reg)) {
+ verbose("R%d pointer arithmetic prohibited\n",
+ insn->src_reg);
+ return -EACCES;
+ }
+
+ /* check dest operand */
+ err = check_reg_arg(regs, insn->dst_reg, DST_OP);
+ if (err)
+ return err;
+
+ if (stack_relative) {
+ regs[insn->dst_reg].type = PTR_TO_STACK;
+ regs[insn->dst_reg].imm = insn->imm;
+ }
+ }
+
+ return 0;
+}
+
+static int check_cond_jmp_op(struct verifier_env *env,
+ struct bpf_insn *insn, int *insn_idx)
+{
+ struct reg_state *regs = env->cur_state.regs;
+ struct verifier_state *other_branch;
+ u8 opcode = BPF_OP(insn->code);
+ int err;
+
+ if (opcode > BPF_EXIT) {
+ verbose("invalid BPF_JMP opcode %x\n", opcode);
+ return -EINVAL;
+ }
+
+ if (BPF_SRC(insn->code) == BPF_X) {
+ if (insn->imm != 0) {
+ verbose("BPF_JMP uses reserved fields\n");
+ return -EINVAL;
+ }
+
+ /* check src1 operand */
+ err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ if (err)
+ return err;
+
+ if (is_pointer_value(env, insn->src_reg)) {
+ verbose("R%d pointer comparison prohibited\n",
+ insn->src_reg);
+ return -EACCES;
+ }
+ } else {
+ if (insn->src_reg != BPF_REG_0) {
+ verbose("BPF_JMP uses reserved fields\n");
+ return -EINVAL;
+ }
+ }
+
+ /* check src2 operand */
+ err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
+ if (err)
+ return err;
+
+ /* detect if R == 0 where R was initialized to zero earlier */
+ if (BPF_SRC(insn->code) == BPF_K &&
+ (opcode == BPF_JEQ || opcode == BPF_JNE) &&
+ regs[insn->dst_reg].type == CONST_IMM &&
+ regs[insn->dst_reg].imm == insn->imm) {
+ if (opcode == BPF_JEQ) {
+ /* if (imm == imm) goto pc+off;
+ * only follow the goto, ignore fall-through
+ */
+ *insn_idx += insn->off;
+ return 0;
+ } else {
+ /* if (imm != imm) goto pc+off;
+ * only follow fall-through branch, since
+ * that's where the program will go
+ */
+ return 0;
+ }
+ }
+
+ other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx);
+ if (!other_branch)
+ return -EFAULT;
+
+ /* detect if R == 0 where R is returned value from bpf_map_lookup_elem() */
+ if (BPF_SRC(insn->code) == BPF_K &&
+ insn->imm == 0 && (opcode == BPF_JEQ ||
+ opcode == BPF_JNE) &&
+ regs[insn->dst_reg].type == PTR_TO_MAP_VALUE_OR_NULL) {
+ if (opcode == BPF_JEQ) {
+ /* next fallthrough insn can access memory via
+ * this register
+ */
+ regs[insn->dst_reg].type = PTR_TO_MAP_VALUE;
+ /* branch targer cannot access it, since reg == 0 */
+ other_branch->regs[insn->dst_reg].type = CONST_IMM;
+ other_branch->regs[insn->dst_reg].imm = 0;
+ } else {
+ other_branch->regs[insn->dst_reg].type = PTR_TO_MAP_VALUE;
+ regs[insn->dst_reg].type = CONST_IMM;
+ regs[insn->dst_reg].imm = 0;
+ }
+ } else if (is_pointer_value(env, insn->dst_reg)) {
+ verbose("R%d pointer comparison prohibited\n", insn->dst_reg);
+ return -EACCES;
+ } else if (BPF_SRC(insn->code) == BPF_K &&
+ (opcode == BPF_JEQ || opcode == BPF_JNE)) {
+
+ if (opcode == BPF_JEQ) {
+ /* detect if (R == imm) goto
+ * and in the target state recognize that R = imm
+ */
+ other_branch->regs[insn->dst_reg].type = CONST_IMM;
+ other_branch->regs[insn->dst_reg].imm = insn->imm;
+ } else {
+ /* detect if (R != imm) goto
+ * and in the fall-through state recognize that R = imm
+ */
+ regs[insn->dst_reg].type = CONST_IMM;
+ regs[insn->dst_reg].imm = insn->imm;
+ }
+ }
+ if (log_level)
+ print_verifier_state(env);
+ return 0;
+}
+
+/* return the map pointer stored inside BPF_LD_IMM64 instruction */
+static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn)
+{
+ u64 imm64 = ((u64) (u32) insn[0].imm) | ((u64) (u32) insn[1].imm) << 32;
+
+ return (struct bpf_map *) (unsigned long) imm64;
+}
+
+/* verify BPF_LD_IMM64 instruction */
+static int check_ld_imm(struct verifier_env *env, struct bpf_insn *insn)
+{
+ struct reg_state *regs = env->cur_state.regs;
+ int err;
+
+ if (BPF_SIZE(insn->code) != BPF_DW) {
+ verbose("invalid BPF_LD_IMM insn\n");
+ return -EINVAL;
+ }
+ if (insn->off != 0) {
+ verbose("BPF_LD_IMM64 uses reserved fields\n");
+ return -EINVAL;
+ }
+
+ err = check_reg_arg(regs, insn->dst_reg, DST_OP);
+ if (err)
+ return err;
+
+ if (insn->src_reg == 0)
+ /* generic move 64-bit immediate into a register */
+ return 0;
+
+ /* replace_map_fd_with_map_ptr() should have caught bad ld_imm64 */
+ BUG_ON(insn->src_reg != BPF_PSEUDO_MAP_FD);
+
+ regs[insn->dst_reg].type = CONST_PTR_TO_MAP;
+ regs[insn->dst_reg].map_ptr = ld_imm64_to_map_ptr(insn);
+ return 0;
+}
+
+static bool may_access_skb(enum bpf_prog_type type)
+{
+ switch (type) {
+ case BPF_PROG_TYPE_SOCKET_FILTER:
+ case BPF_PROG_TYPE_SCHED_CLS:
+ case BPF_PROG_TYPE_SCHED_ACT:
+ return true;
+ default:
+ return false;
+ }
+}
+
+/* verify safety of LD_ABS|LD_IND instructions:
+ * - they can only appear in the programs where ctx == skb
+ * - since they are wrappers of function calls, they scratch R1-R5 registers,
+ * preserve R6-R9, and store return value into R0
+ *
+ * Implicit input:
+ * ctx == skb == R6 == CTX
+ *
+ * Explicit input:
+ * SRC == any register
+ * IMM == 32-bit immediate
+ *
+ * Output:
+ * R0 - 8/16/32-bit skb data converted to cpu endianness
+ */
+static int check_ld_abs(struct verifier_env *env, struct bpf_insn *insn)
+{
+ struct reg_state *regs = env->cur_state.regs;
+ u8 mode = BPF_MODE(insn->code);
+ struct reg_state *reg;
+ int i, err;
+
+ if (!may_access_skb(env->prog->type)) {
+ verbose("BPF_LD_ABS|IND instructions not allowed for this program type\n");
+ return -EINVAL;
+ }
+
+ if (insn->dst_reg != BPF_REG_0 || insn->off != 0 ||
+ BPF_SIZE(insn->code) == BPF_DW ||
+ (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) {
+ verbose("BPF_LD_ABS uses reserved fields\n");
+ return -EINVAL;
+ }
+
+ /* check whether implicit source operand (register R6) is readable */
+ err = check_reg_arg(regs, BPF_REG_6, SRC_OP);
+ if (err)
+ return err;
+
+ if (regs[BPF_REG_6].type != PTR_TO_CTX) {
+ verbose("at the time of BPF_LD_ABS|IND R6 != pointer to skb\n");
+ return -EINVAL;
+ }
+
+ if (mode == BPF_IND) {
+ /* check explicit source operand */
+ err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ if (err)
+ return err;
+ }
+
+ /* reset caller saved regs to unreadable */
+ for (i = 0; i < CALLER_SAVED_REGS; i++) {
+ reg = regs + caller_saved[i];
+ reg->type = NOT_INIT;
+ reg->imm = 0;
+ }
+
+ /* mark destination R0 register as readable, since it contains
+ * the value fetched from the packet
+ */
+ regs[BPF_REG_0].type = UNKNOWN_VALUE;
+ return 0;
+}
+
+/* non-recursive DFS pseudo code
+ * 1 procedure DFS-iterative(G,v):
+ * 2 label v as discovered
+ * 3 let S be a stack
+ * 4 S.push(v)
+ * 5 while S is not empty
+ * 6 t <- S.pop()
+ * 7 if t is what we're looking for:
+ * 8 return t
+ * 9 for all edges e in G.adjacentEdges(t) do
+ * 10 if edge e is already labelled
+ * 11 continue with the next edge
+ * 12 w <- G.adjacentVertex(t,e)
+ * 13 if vertex w is not discovered and not explored
+ * 14 label e as tree-edge
+ * 15 label w as discovered
+ * 16 S.push(w)
+ * 17 continue at 5
+ * 18 else if vertex w is discovered
+ * 19 label e as back-edge
+ * 20 else
+ * 21 // vertex w is explored
+ * 22 label e as forward- or cross-edge
+ * 23 label t as explored
+ * 24 S.pop()
+ *
+ * convention:
+ * 0x10 - discovered
+ * 0x11 - discovered and fall-through edge labelled
+ * 0x12 - discovered and fall-through and branch edges labelled
+ * 0x20 - explored
+ */
+
+enum {
+ DISCOVERED = 0x10,
+ EXPLORED = 0x20,
+ FALLTHROUGH = 1,
+ BRANCH = 2,
+};
+
+#define STATE_LIST_MARK ((struct verifier_state_list *) -1L)
+
+static int *insn_stack; /* stack of insns to process */
+static int cur_stack; /* current stack index */
+static int *insn_state;
+
+/* t, w, e - match pseudo-code above:
+ * t - index of current instruction
+ * w - next instruction
+ * e - edge
+ */
+static int push_insn(int t, int w, int e, struct verifier_env *env)
+{
+ if (e == FALLTHROUGH && insn_state[t] >= (DISCOVERED | FALLTHROUGH))
+ return 0;
+
+ if (e == BRANCH && insn_state[t] >= (DISCOVERED | BRANCH))
+ return 0;
+
+ if (w < 0 || w >= env->prog->len) {
+ verbose("jump out of range from insn %d to %d\n", t, w);
+ return -EINVAL;
+ }
+
+ if (e == BRANCH)
+ /* mark branch target for state pruning */
+ env->explored_states[w] = STATE_LIST_MARK;
+
+ if (insn_state[w] == 0) {
+ /* tree-edge */
+ insn_state[t] = DISCOVERED | e;
+ insn_state[w] = DISCOVERED;
+ if (cur_stack >= env->prog->len)
+ return -E2BIG;
+ insn_stack[cur_stack++] = w;
+ return 1;
+ } else if ((insn_state[w] & 0xF0) == DISCOVERED) {
+ verbose("back-edge from insn %d to %d\n", t, w);
+ return -EINVAL;
+ } else if (insn_state[w] == EXPLORED) {
+ /* forward- or cross-edge */
+ insn_state[t] = DISCOVERED | e;
+ } else {
+ verbose("insn state internal bug\n");
+ return -EFAULT;
+ }
+ return 0;
+}
+
+/* non-recursive depth-first-search to detect loops in BPF program
+ * loop == back-edge in directed graph
+ */
+static int check_cfg(struct verifier_env *env)
+{
+ struct bpf_insn *insns = env->prog->insnsi;
+ int insn_cnt = env->prog->len;
+ int ret = 0;
+ int i, t;
+
+ insn_state = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
+ if (!insn_state)
+ return -ENOMEM;
+
+ insn_stack = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
+ if (!insn_stack) {
+ kfree(insn_state);
+ return -ENOMEM;
+ }
+
+ insn_state[0] = DISCOVERED; /* mark 1st insn as discovered */
+ insn_stack[0] = 0; /* 0 is the first instruction */
+ cur_stack = 1;
+
+peek_stack:
+ if (cur_stack == 0)
+ goto check_state;
+ t = insn_stack[cur_stack - 1];
+
+ if (BPF_CLASS(insns[t].code) == BPF_JMP) {
+ u8 opcode = BPF_OP(insns[t].code);
+
+ if (opcode == BPF_EXIT) {
+ goto mark_explored;
+ } else if (opcode == BPF_CALL) {
+ ret = push_insn(t, t + 1, FALLTHROUGH, env);
+ if (ret == 1)
+ goto peek_stack;
+ else if (ret < 0)
+ goto err_free;
+ } else if (opcode == BPF_JA) {
+ if (BPF_SRC(insns[t].code) != BPF_K) {
+ ret = -EINVAL;
+ goto err_free;
+ }
+ /* unconditional jump with single edge */
+ ret = push_insn(t, t + insns[t].off + 1,
+ FALLTHROUGH, env);
+ if (ret == 1)
+ goto peek_stack;
+ else if (ret < 0)
+ goto err_free;
+ /* tell verifier to check for equivalent states
+ * after every call and jump
+ */
+ if (t + 1 < insn_cnt)
+ env->explored_states[t + 1] = STATE_LIST_MARK;
+ } else {
+ /* conditional jump with two edges */
+ ret = push_insn(t, t + 1, FALLTHROUGH, env);
+ if (ret == 1)
+ goto peek_stack;
+ else if (ret < 0)
+ goto err_free;
+
+ ret = push_insn(t, t + insns[t].off + 1, BRANCH, env);
+ if (ret == 1)
+ goto peek_stack;
+ else if (ret < 0)
+ goto err_free;
+ }
+ } else {
+ /* all other non-branch instructions with single
+ * fall-through edge
+ */
+ ret = push_insn(t, t + 1, FALLTHROUGH, env);
+ if (ret == 1)
+ goto peek_stack;
+ else if (ret < 0)
+ goto err_free;
+ }
+
+mark_explored:
+ insn_state[t] = EXPLORED;
+ if (cur_stack-- <= 0) {
+ verbose("pop stack internal bug\n");
+ ret = -EFAULT;
+ goto err_free;
+ }
+ goto peek_stack;
+
+check_state:
+ for (i = 0; i < insn_cnt; i++) {
+ if (insn_state[i] != EXPLORED) {
+ verbose("unreachable insn %d\n", i);
+ ret = -EINVAL;
+ goto err_free;
+ }
+ }
+ ret = 0; /* cfg looks good */
+
+err_free:
+ kfree(insn_state);
+ kfree(insn_stack);
+ return ret;
+}
+
+/* compare two verifier states
+ *
+ * all states stored in state_list are known to be valid, since
+ * verifier reached 'bpf_exit' instruction through them
+ *
+ * this function is called when verifier exploring different branches of
+ * execution popped from the state stack. If it sees an old state that has
+ * more strict register state and more strict stack state then this execution
+ * branch doesn't need to be explored further, since verifier already
+ * concluded that more strict state leads to valid finish.
+ *
+ * Therefore two states are equivalent if register state is more conservative
+ * and explored stack state is more conservative than the current one.
+ * Example:
+ * explored current
+ * (slot1=INV slot2=MISC) == (slot1=MISC slot2=MISC)
+ * (slot1=MISC slot2=MISC) != (slot1=INV slot2=MISC)
+ *
+ * In other words if current stack state (one being explored) has more
+ * valid slots than old one that already passed validation, it means
+ * the verifier can stop exploring and conclude that current state is valid too
+ *
+ * Similarly with registers. If explored state has register type as invalid
+ * whereas register type in current state is meaningful, it means that
+ * the current state will reach 'bpf_exit' instruction safely
+ */
+static bool states_equal(struct verifier_state *old, struct verifier_state *cur)
+{
+ int i;
+
+ for (i = 0; i < MAX_BPF_REG; i++) {
+ if (memcmp(&old->regs[i], &cur->regs[i],
+ sizeof(old->regs[0])) != 0) {
+ if (old->regs[i].type == NOT_INIT ||
+ (old->regs[i].type == UNKNOWN_VALUE &&
+ cur->regs[i].type != NOT_INIT))
+ continue;
+ return false;
+ }
+ }
+
+ for (i = 0; i < MAX_BPF_STACK; i++) {
+ if (old->stack_slot_type[i] == STACK_INVALID)
+ continue;
+ if (old->stack_slot_type[i] != cur->stack_slot_type[i])
+ /* Ex: old explored (safe) state has STACK_SPILL in
+ * this stack slot, but current has has STACK_MISC ->
+ * this verifier states are not equivalent,
+ * return false to continue verification of this path
+ */
+ return false;
+ if (i % BPF_REG_SIZE)
+ continue;
+ if (memcmp(&old->spilled_regs[i / BPF_REG_SIZE],
+ &cur->spilled_regs[i / BPF_REG_SIZE],
+ sizeof(old->spilled_regs[0])))
+ /* when explored and current stack slot types are
+ * the same, check that stored pointers types
+ * are the same as well.
+ * Ex: explored safe path could have stored
+ * (struct reg_state) {.type = PTR_TO_STACK, .imm = -8}
+ * but current path has stored:
+ * (struct reg_state) {.type = PTR_TO_STACK, .imm = -16}
+ * such verifier states are not equivalent.
+ * return false to continue verification of this path
+ */
+ return false;
+ else
+ continue;
+ }
+ return true;
+}
+
+static int is_state_visited(struct verifier_env *env, int insn_idx)
+{
+ struct verifier_state_list *new_sl;
+ struct verifier_state_list *sl;
+
+ sl = env->explored_states[insn_idx];
+ if (!sl)
+ /* this 'insn_idx' instruction wasn't marked, so we will not
+ * be doing state search here
+ */
+ return 0;
+
+ while (sl != STATE_LIST_MARK) {
+ if (states_equal(&sl->state, &env->cur_state))
+ /* reached equivalent register/stack state,
+ * prune the search
+ */
+ return 1;
+ sl = sl->next;
+ }
+
+ /* there were no equivalent states, remember current one.
+ * technically the current state is not proven to be safe yet,
+ * but it will either reach bpf_exit (which means it's safe) or
+ * it will be rejected. Since there are no loops, we won't be
+ * seeing this 'insn_idx' instruction again on the way to bpf_exit
+ */
+ new_sl = kmalloc(sizeof(struct verifier_state_list), GFP_USER);
+ if (!new_sl)
+ return -ENOMEM;
+
+ /* add new state to the head of linked list */
+ memcpy(&new_sl->state, &env->cur_state, sizeof(env->cur_state));
+ new_sl->next = env->explored_states[insn_idx];
+ env->explored_states[insn_idx] = new_sl;
+ return 0;
+}
+
+static int do_check(struct verifier_env *env)
+{
+ struct verifier_state *state = &env->cur_state;
+ struct bpf_insn *insns = env->prog->insnsi;
+ struct reg_state *regs = state->regs;
+ int insn_cnt = env->prog->len;
+ int insn_idx, prev_insn_idx = 0;
+ int insn_processed = 0;
+ bool do_print_state = false;
+
+ init_reg_state(regs);
+ insn_idx = 0;
+ for (;;) {
+ struct bpf_insn *insn;
+ u8 class;
+ int err;
+
+ if (insn_idx >= insn_cnt) {
+ verbose("invalid insn idx %d insn_cnt %d\n",
+ insn_idx, insn_cnt);
+ return -EFAULT;
+ }
+
+ insn = &insns[insn_idx];
+ class = BPF_CLASS(insn->code);
+
+ if (++insn_processed > 32768) {
+ verbose("BPF program is too large. Proccessed %d insn\n",
+ insn_processed);
+ return -E2BIG;
+ }
+
+ err = is_state_visited(env, insn_idx);
+ if (err < 0)
+ return err;
+ if (err == 1) {
+ /* found equivalent state, can prune the search */
+ if (log_level) {
+ if (do_print_state)
+ verbose("\nfrom %d to %d: safe\n",
+ prev_insn_idx, insn_idx);
+ else
+ verbose("%d: safe\n", insn_idx);
+ }
+ goto process_bpf_exit;
+ }
+
+ if (log_level && do_print_state) {
+ verbose("\nfrom %d to %d:", prev_insn_idx, insn_idx);
+ print_verifier_state(env);
+ do_print_state = false;
+ }
+
+ if (log_level) {
+ verbose("%d: ", insn_idx);
+ print_bpf_insn(env, insn);
+ }
+
+ env->insn_aux_data[insn_idx].seen = true;
+ if (class == BPF_ALU || class == BPF_ALU64) {
+ err = check_alu_op(env, insn);
+ if (err)
+ return err;
+
+ } else if (class == BPF_LDX) {
+ enum bpf_reg_type *prev_src_type, src_reg_type;
+
+ /* check for reserved fields is already done */
+
+ /* check src operand */
+ err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ if (err)
+ return err;
+
+ err = check_reg_arg(regs, insn->dst_reg, DST_OP_NO_MARK);
+ if (err)
+ return err;
+
+ src_reg_type = regs[insn->src_reg].type;
+
+ /* check that memory (src_reg + off) is readable,
+ * the state of dst_reg will be updated by this func
+ */
+ err = check_mem_access(env, insn_idx, insn->src_reg, insn->off,
+ BPF_SIZE(insn->code), BPF_READ,
+ insn->dst_reg);
+ if (err)
+ return err;
+
+ if (BPF_SIZE(insn->code) != BPF_W &&
+ BPF_SIZE(insn->code) != BPF_DW) {
+ insn_idx++;
+ continue;
+ }
+
+ prev_src_type = &env->insn_aux_data[insn_idx].ptr_type;
+
+ if (*prev_src_type == NOT_INIT) {
+ /* saw a valid insn
+ * dst_reg = *(u32 *)(src_reg + off)
+ * save type to validate intersecting paths
+ */
+ *prev_src_type = src_reg_type;
+
+ } else if (src_reg_type != *prev_src_type &&
+ (src_reg_type == PTR_TO_CTX ||
+ *prev_src_type == PTR_TO_CTX)) {
+ /* ABuser program is trying to use the same insn
+ * dst_reg = *(u32*) (src_reg + off)
+ * with different pointer types:
+ * src_reg == ctx in one branch and
+ * src_reg == stack|map in some other branch.
+ * Reject it.
+ */
+ verbose("same insn cannot be used with different pointers\n");
+ return -EINVAL;
+ }
+
+ } else if (class == BPF_STX) {
+ enum bpf_reg_type *prev_dst_type, dst_reg_type;
+
+ if (BPF_MODE(insn->code) == BPF_XADD) {
+ err = check_xadd(env, insn_idx, insn);
+ if (err)
+ return err;
+ insn_idx++;
+ continue;
+ }
+
+ /* check src1 operand */
+ err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ if (err)
+ return err;
+ /* check src2 operand */
+ err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
+ if (err)
+ return err;
+
+ dst_reg_type = regs[insn->dst_reg].type;
+
+ /* check that memory (dst_reg + off) is writeable */
+ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
+ BPF_SIZE(insn->code), BPF_WRITE,
+ insn->src_reg);
+ if (err)
+ return err;
+
+ prev_dst_type = &env->insn_aux_data[insn_idx].ptr_type;
+
+ if (*prev_dst_type == NOT_INIT) {
+ *prev_dst_type = dst_reg_type;
+ } else if (dst_reg_type != *prev_dst_type &&
+ (dst_reg_type == PTR_TO_CTX ||
+ *prev_dst_type == PTR_TO_CTX)) {
+ verbose("same insn cannot be used with different pointers\n");
+ return -EINVAL;
+ }
+
+ } else if (class == BPF_ST) {
+ if (BPF_MODE(insn->code) != BPF_MEM ||
+ insn->src_reg != BPF_REG_0) {
+ verbose("BPF_ST uses reserved fields\n");
+ return -EINVAL;
+ }
+ /* check src operand */
+ err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
+ if (err)
+ return err;
+
+ if (is_ctx_reg(env, insn->dst_reg)) {
+ verbose("BPF_ST stores into R%d context is not allowed\n",
+ insn->dst_reg);
+ return -EACCES;
+ }
+
+ /* check that memory (dst_reg + off) is writeable */
+ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
+ BPF_SIZE(insn->code), BPF_WRITE,
+ -1);
+ if (err)
+ return err;
+
+ } else if (class == BPF_JMP) {
+ u8 opcode = BPF_OP(insn->code);
+
+ if (opcode == BPF_CALL) {
+ if (BPF_SRC(insn->code) != BPF_K ||
+ insn->off != 0 ||
+ insn->src_reg != BPF_REG_0 ||
+ insn->dst_reg != BPF_REG_0) {
+ verbose("BPF_CALL uses reserved fields\n");
+ return -EINVAL;
+ }
+
+ err = check_call(env, insn->imm, insn_idx);
+ if (err)
+ return err;
+
+ } else if (opcode == BPF_JA) {
+ if (BPF_SRC(insn->code) != BPF_K ||
+ insn->imm != 0 ||
+ insn->src_reg != BPF_REG_0 ||
+ insn->dst_reg != BPF_REG_0) {
+ verbose("BPF_JA uses reserved fields\n");
+ return -EINVAL;
+ }
+
+ insn_idx += insn->off + 1;
+ continue;
+
+ } else if (opcode == BPF_EXIT) {
+ if (BPF_SRC(insn->code) != BPF_K ||
+ insn->imm != 0 ||
+ insn->src_reg != BPF_REG_0 ||
+ insn->dst_reg != BPF_REG_0) {
+ verbose("BPF_EXIT uses reserved fields\n");
+ return -EINVAL;
+ }
+
+ /* eBPF calling convetion is such that R0 is used
+ * to return the value from eBPF program.
+ * Make sure that it's readable at this time
+ * of bpf_exit, which means that program wrote
+ * something into it earlier
+ */
+ err = check_reg_arg(regs, BPF_REG_0, SRC_OP);
+ if (err)
+ return err;
+
+ if (is_pointer_value(env, BPF_REG_0)) {
+ verbose("R0 leaks addr as return value\n");
+ return -EACCES;
+ }
+
+process_bpf_exit:
+ insn_idx = pop_stack(env, &prev_insn_idx);
+ if (insn_idx < 0) {
+ break;
+ } else {
+ do_print_state = true;
+ continue;
+ }
+ } else {
+ err = check_cond_jmp_op(env, insn, &insn_idx);
+ if (err)
+ return err;
+ }
+ } else if (class == BPF_LD) {
+ u8 mode = BPF_MODE(insn->code);
+
+ if (mode == BPF_ABS || mode == BPF_IND) {
+ err = check_ld_abs(env, insn);
+ if (err)
+ return err;
+
+ } else if (mode == BPF_IMM) {
+ err = check_ld_imm(env, insn);
+ if (err)
+ return err;
+
+ insn_idx++;
+ env->insn_aux_data[insn_idx].seen = true;
+ } else {
+ verbose("invalid BPF_LD mode\n");
+ return -EINVAL;
+ }
+ } else {
+ verbose("unknown insn class %d\n", class);
+ return -EINVAL;
+ }
+
+ insn_idx++;
+ }
+
+ return 0;
+}
+
+/* look for pseudo eBPF instructions that access map FDs and
+ * replace them with actual map pointers
+ */
+static int replace_map_fd_with_map_ptr(struct verifier_env *env)
+{
+ struct bpf_insn *insn = env->prog->insnsi;
+ int insn_cnt = env->prog->len;
+ int i, j;
+
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ if (BPF_CLASS(insn->code) == BPF_LDX &&
+ (BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0)) {
+ verbose("BPF_LDX uses reserved fields\n");
+ return -EINVAL;
+ }
+
+ if (BPF_CLASS(insn->code) == BPF_STX &&
+ ((BPF_MODE(insn->code) != BPF_MEM &&
+ BPF_MODE(insn->code) != BPF_XADD) || insn->imm != 0)) {
+ verbose("BPF_STX uses reserved fields\n");
+ return -EINVAL;
+ }
+
+ if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) {
+ struct bpf_map *map;
+ struct fd f;
+
+ if (i == insn_cnt - 1 || insn[1].code != 0 ||
+ insn[1].dst_reg != 0 || insn[1].src_reg != 0 ||
+ insn[1].off != 0) {
+ verbose("invalid bpf_ld_imm64 insn\n");
+ return -EINVAL;
+ }
+
+ if (insn->src_reg == 0)
+ /* valid generic load 64-bit imm */
+ goto next_insn;
+
+ if (insn->src_reg != BPF_PSEUDO_MAP_FD) {
+ verbose("unrecognized bpf_ld_imm64 insn\n");
+ return -EINVAL;
+ }
+
+ f = fdget(insn->imm);
+ map = __bpf_map_get(f);
+ if (IS_ERR(map)) {
+ verbose("fd %d is not pointing to valid bpf_map\n",
+ insn->imm);
+ return PTR_ERR(map);
+ }
+
+ /* store map pointer inside BPF_LD_IMM64 instruction */
+ insn[0].imm = (u32) (unsigned long) map;
+ insn[1].imm = ((u64) (unsigned long) map) >> 32;
+
+ /* check whether we recorded this map already */
+ for (j = 0; j < env->used_map_cnt; j++)
+ if (env->used_maps[j] == map) {
+ fdput(f);
+ goto next_insn;
+ }
+
+ if (env->used_map_cnt >= MAX_USED_MAPS) {
+ fdput(f);
+ return -E2BIG;
+ }
+
+ /* hold the map. If the program is rejected by verifier,
+ * the map will be released by release_maps() or it
+ * will be used by the valid program until it's unloaded
+ * and all maps are released in free_used_maps()
+ */
+ map = bpf_map_inc(map, false);
+ if (IS_ERR(map)) {
+ fdput(f);
+ return PTR_ERR(map);
+ }
+ env->used_maps[env->used_map_cnt++] = map;
+
+ fdput(f);
+next_insn:
+ insn++;
+ i++;
+ }
+ }
+
+ /* now all pseudo BPF_LD_IMM64 instructions load valid
+ * 'struct bpf_map *' into a register instead of user map_fd.
+ * These pointers will be used later by verifier to validate map access.
+ */
+ return 0;
+}
+
+/* drop refcnt of maps used by the rejected program */
+static void release_maps(struct verifier_env *env)
+{
+ int i;
+
+ for (i = 0; i < env->used_map_cnt; i++)
+ bpf_map_put(env->used_maps[i]);
+}
+
+/* convert pseudo BPF_LD_IMM64 into generic BPF_LD_IMM64 */
+static void convert_pseudo_ld_imm64(struct verifier_env *env)
+{
+ struct bpf_insn *insn = env->prog->insnsi;
+ int insn_cnt = env->prog->len;
+ int i;
+
+ for (i = 0; i < insn_cnt; i++, insn++)
+ if (insn->code == (BPF_LD | BPF_IMM | BPF_DW))
+ insn->src_reg = 0;
+}
+
+/* single env->prog->insni[off] instruction was replaced with the range
+ * insni[off, off + cnt). Adjust corresponding insn_aux_data by copying
+ * [0, off) and [off, end) to new locations, so the patched range stays zero
+ */
+static int adjust_insn_aux_data(struct verifier_env *env, u32 prog_len,
+ u32 off, u32 cnt)
+{
+ struct bpf_insn_aux_data *new_data, *old_data = env->insn_aux_data;
+ int i;
+
+ if (cnt == 1)
+ return 0;
+ new_data = vzalloc(sizeof(struct bpf_insn_aux_data) * prog_len);
+ if (!new_data)
+ return -ENOMEM;
+ memcpy(new_data, old_data, sizeof(struct bpf_insn_aux_data) * off);
+ memcpy(new_data + off + cnt - 1, old_data + off,
+ sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1));
+ for (i = off; i < off + cnt - 1; i++)
+ new_data[i].seen = true;
+ env->insn_aux_data = new_data;
+ vfree(old_data);
+ return 0;
+}
+
+static struct bpf_prog *bpf_patch_insn_data(struct verifier_env *env, u32 off,
+ const struct bpf_insn *patch, u32 len)
+{
+ struct bpf_prog *new_prog;
+
+ new_prog = bpf_patch_insn_single(env->prog, off, patch, len);
+ if (!new_prog)
+ return NULL;
+ if (adjust_insn_aux_data(env, new_prog->len, off, len))
+ return NULL;
+ return new_prog;
+}
+
+/* The verifier does more data flow analysis than llvm and will not explore
+ * branches that are dead at run time. Malicious programs can have dead code
+ * too. Therefore replace all dead at-run-time code with nops.
+ */
+static void sanitize_dead_code(struct verifier_env *env)
+{
+ struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
+ struct bpf_insn nop = BPF_MOV64_REG(BPF_REG_0, BPF_REG_0);
+ struct bpf_insn *insn = env->prog->insnsi;
+ const int insn_cnt = env->prog->len;
+ int i;
+
+ for (i = 0; i < insn_cnt; i++) {
+ if (aux_data[i].seen)
+ continue;
+ memcpy(insn + i, &nop, sizeof(nop));
+ }
+}
+
+/* convert load instructions that access fields of 'struct __sk_buff'
+ * into sequence of instructions that access fields of 'struct sk_buff'
+ */
+static int convert_ctx_accesses(struct verifier_env *env)
+{
+ struct bpf_insn *insn = env->prog->insnsi;
+ const int insn_cnt = env->prog->len;
+ struct bpf_insn insn_buf[16];
+ struct bpf_prog *new_prog;
+ enum bpf_access_type type;
+ int i, delta = 0;
+
+ if (!env->prog->aux->ops->convert_ctx_access)
+ return 0;
+
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ u32 cnt;
+
+ if (insn->code == (BPF_LDX | BPF_MEM | BPF_W) ||
+ insn->code == (BPF_LDX | BPF_MEM | BPF_DW))
+ type = BPF_READ;
+ else if (insn->code == (BPF_STX | BPF_MEM | BPF_W) ||
+ insn->code == (BPF_STX | BPF_MEM | BPF_DW))
+ type = BPF_WRITE;
+ else
+ continue;
+
+ if (type == BPF_WRITE &&
+ env->insn_aux_data[i + delta].sanitize_stack_off) {
+ struct bpf_insn patch[] = {
+ /* Sanitize suspicious stack slot with zero.
+ * There are no memory dependencies for this store,
+ * since it's only using frame pointer and immediate
+ * constant of zero
+ */
+ BPF_ST_MEM(BPF_DW, BPF_REG_FP,
+ env->insn_aux_data[i + delta].sanitize_stack_off,
+ 0),
+ /* the original STX instruction will immediately
+ * overwrite the same stack slot with appropriate value
+ */
+ *insn,
+ };
+
+ cnt = ARRAY_SIZE(patch);
+ new_prog = bpf_patch_insn_data(env, i + delta, patch, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ continue;
+ }
+
+ if (env->insn_aux_data[i + delta].ptr_type != PTR_TO_CTX)
+ continue;
+
+ cnt = env->prog->aux->ops->
+ convert_ctx_access(type, insn->dst_reg, insn->src_reg,
+ insn->off, insn_buf, env->prog);
+ if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
+ verbose("bpf verifier is misconfigured\n");
+ return -EINVAL;
+ }
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+
+ /* keep walking new program and skip insns we just inserted */
+ env->prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ }
+
+ return 0;
+}
+
+/* fixup insn->imm field of bpf_call instructions
+ *
+ * this function is called after eBPF program passed verification
+ */
+static int fixup_bpf_calls(struct verifier_env *env)
+{
+ struct bpf_prog *prog = env->prog;
+ struct bpf_insn *insn = prog->insnsi;
+ const struct bpf_func_proto *fn;
+ const int insn_cnt = prog->len;
+ struct bpf_insn insn_buf[16];
+ struct bpf_prog *new_prog;
+ struct bpf_map *map_ptr;
+ int i, cnt, delta = 0;
+
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ if (insn->code == (BPF_ALU | BPF_MOD | BPF_X) ||
+ insn->code == (BPF_ALU | BPF_DIV | BPF_X)) {
+ /* due to JIT bugs clear upper 32-bits of src register
+ * before div/mod operation
+ */
+ insn_buf[0] = BPF_MOV32_REG(insn->src_reg, insn->src_reg);
+ insn_buf[1] = *insn;
+ cnt = 2;
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ continue;
+ }
+
+ if (insn->code != (BPF_JMP | BPF_CALL))
+ continue;
+
+ if (insn->imm == BPF_FUNC_get_route_realm)
+ prog->dst_needed = 1;
+ if (insn->imm == BPF_FUNC_get_prandom_u32)
+ bpf_user_rnd_init_once();
+ if (insn->imm == BPF_FUNC_tail_call) {
+ /* mark bpf_tail_call as different opcode to avoid
+ * conditional branch in the interpeter for every normal
+ * call and to prevent accidental JITing by JIT compiler
+ * that doesn't support bpf_tail_call yet
+ */
+ insn->imm = 0;
+ insn->code |= BPF_X;
+
+ /* instead of changing every JIT dealing with tail_call
+ * emit two extra insns:
+ * if (index >= max_entries) goto out;
+ * index &= array->index_mask;
+ * to avoid out-of-bounds cpu speculation
+ */
+ map_ptr = env->insn_aux_data[i + delta].map_ptr;
+ if (!map_ptr->unpriv_array)
+ continue;
+ insn_buf[0] = BPF_JMP_IMM(BPF_JGE, BPF_REG_3,
+ map_ptr->max_entries, 2);
+ insn_buf[1] = BPF_ALU32_IMM(BPF_AND, BPF_REG_3,
+ container_of(map_ptr,
+ struct bpf_array,
+ map)->index_mask);
+ insn_buf[2] = *insn;
+ cnt = 3;
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ continue;
+ }
+
+ fn = prog->aux->ops->get_func_proto(insn->imm);
+ /* all functions that have prototype and verifier allowed
+ * programs to call them, must be real in-kernel functions
+ */
+ if (!fn->func) {
+ verbose("kernel subsystem misconfigured func %d\n",
+ insn->imm);
+ return -EFAULT;
+ }
+ insn->imm = fn->func - __bpf_call_base;
+ }
+
+ return 0;
+}
+
+static void free_states(struct verifier_env *env)
+{
+ struct verifier_state_list *sl, *sln;
+ int i;
+
+ if (!env->explored_states)
+ return;
+
+ for (i = 0; i < env->prog->len; i++) {
+ sl = env->explored_states[i];
+
+ if (sl)
+ while (sl != STATE_LIST_MARK) {
+ sln = sl->next;
+ kfree(sl);
+ sl = sln;
+ }
+ }
+
+ kfree(env->explored_states);
+}
+
+int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
+{
+ char __user *log_ubuf = NULL;
+ struct verifier_env *env;
+ int ret = -EINVAL;
+
+ if ((*prog)->len <= 0 || (*prog)->len > BPF_MAXINSNS)
+ return -E2BIG;
+
+ /* 'struct verifier_env' can be global, but since it's not small,
+ * allocate/free it every time bpf_check() is called
+ */
+ env = kzalloc(sizeof(struct verifier_env), GFP_KERNEL);
+ if (!env)
+ return -ENOMEM;
+
+ env->insn_aux_data = vzalloc(sizeof(struct bpf_insn_aux_data) *
+ (*prog)->len);
+ ret = -ENOMEM;
+ if (!env->insn_aux_data)
+ goto err_free_env;
+ env->prog = *prog;
+
+ /* grab the mutex to protect few globals used by verifier */
+ mutex_lock(&bpf_verifier_lock);
+
+ if (attr->log_level || attr->log_buf || attr->log_size) {
+ /* user requested verbose verifier output
+ * and supplied buffer to store the verification trace
+ */
+ log_level = attr->log_level;
+ log_ubuf = (char __user *) (unsigned long) attr->log_buf;
+ log_size = attr->log_size;
+ log_len = 0;
+
+ ret = -EINVAL;
+ /* log_* values have to be sane */
+ if (log_size < 128 || log_size > UINT_MAX >> 8 ||
+ log_level == 0 || log_ubuf == NULL)
+ goto err_unlock;
+
+ ret = -ENOMEM;
+ log_buf = vmalloc(log_size);
+ if (!log_buf)
+ goto err_unlock;
+ } else {
+ log_level = 0;
+ }
+
+ ret = replace_map_fd_with_map_ptr(env);
+ if (ret < 0)
+ goto skip_full_check;
+
+ env->explored_states = kcalloc(env->prog->len,
+ sizeof(struct verifier_state_list *),
+ GFP_USER);
+ ret = -ENOMEM;
+ if (!env->explored_states)
+ goto skip_full_check;
+
+ ret = check_cfg(env);
+ if (ret < 0)
+ goto skip_full_check;
+
+ env->allow_ptr_leaks = capable(CAP_SYS_ADMIN);
+
+ ret = do_check(env);
+
+skip_full_check:
+ while (pop_stack(env, NULL) >= 0);
+ free_states(env);
+
+ if (ret == 0)
+ sanitize_dead_code(env);
+
+ if (ret == 0)
+ /* program is valid, convert *(u32*)(ctx + off) accesses */
+ ret = convert_ctx_accesses(env);
+
+ if (ret == 0)
+ ret = fixup_bpf_calls(env);
+
+ if (log_level && log_len >= log_size - 1) {
+ BUG_ON(log_len >= log_size);
+ /* verifier log exceeded user supplied buffer */
+ ret = -ENOSPC;
+ /* fall through to return what was recorded */
+ }
+
+ /* copy verifier log back to user space including trailing zero */
+ if (log_level && copy_to_user(log_ubuf, log_buf, log_len + 1) != 0) {
+ ret = -EFAULT;
+ goto free_log_buf;
+ }
+
+ if (ret == 0 && env->used_map_cnt) {
+ /* if program passed verifier, update used_maps in bpf_prog_info */
+ env->prog->aux->used_maps = kmalloc_array(env->used_map_cnt,
+ sizeof(env->used_maps[0]),
+ GFP_KERNEL);
+
+ if (!env->prog->aux->used_maps) {
+ ret = -ENOMEM;
+ goto free_log_buf;
+ }
+
+ memcpy(env->prog->aux->used_maps, env->used_maps,
+ sizeof(env->used_maps[0]) * env->used_map_cnt);
+ env->prog->aux->used_map_cnt = env->used_map_cnt;
+
+ /* program is valid. Convert pseudo bpf_ld_imm64 into generic
+ * bpf_ld_imm64 instructions
+ */
+ convert_pseudo_ld_imm64(env);
+ }
+
+free_log_buf:
+ if (log_level)
+ vfree(log_buf);
+ if (!env->prog->aux->used_maps)
+ /* if we didn't copy map pointers into bpf_prog_info, release
+ * them now. Otherwise free_used_maps() will release them.
+ */
+ release_maps(env);
+ *prog = env->prog;
+err_unlock:
+ mutex_unlock(&bpf_verifier_lock);
+ vfree(env->insn_aux_data);
+err_free_env:
+ kfree(env);
+ return ret;
+}