summaryrefslogtreecommitdiff
path: root/kernel/bpf
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/bpf')
-rw-r--r--kernel/bpf/Makefile6
-rw-r--r--kernel/bpf/arraymap.c359
-rw-r--r--kernel/bpf/cgroup.c459
-rw-r--r--kernel/bpf/core.c544
-rw-r--r--kernel/bpf/hashtab.c642
-rw-r--r--kernel/bpf/helpers.c74
-rw-r--r--kernel/bpf/inode.c78
-rw-r--r--kernel/bpf/percpu_freelist.c104
-rw-r--r--kernel/bpf/percpu_freelist.h31
-rw-r--r--kernel/bpf/stackmap.c293
-rw-r--r--kernel/bpf/syscall.c477
-rw-r--r--kernel/bpf/verifier.c1767
12 files changed, 4247 insertions, 587 deletions
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 677991f29d66..ad110b5aef69 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -2,4 +2,8 @@ obj-y := core.o
CFLAGS_core.o += $(call cc-disable-warning, override-init)
obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o
-obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o
+obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o
+ifeq ($(CONFIG_PERF_EVENTS),y)
+obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
+endif
+obj-$(CONFIG_CGROUP_BPF) += cgroup.o
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index daa4e0782cf7..b30ca0f2fa41 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -11,23 +11,57 @@
*/
#include <linux/bpf.h>
#include <linux/err.h>
-#include <linux/vmalloc.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/filter.h>
#include <linux/perf_event.h>
+#define ARRAY_CREATE_FLAG_MASK \
+ (BPF_F_RDONLY | BPF_F_WRONLY)
+
+static void bpf_array_free_percpu(struct bpf_array *array)
+{
+ int i;
+
+ for (i = 0; i < array->map.max_entries; i++) {
+ free_percpu(array->pptrs[i]);
+ cond_resched();
+ }
+}
+
+static int bpf_array_alloc_percpu(struct bpf_array *array)
+{
+ void __percpu *ptr;
+ int i;
+
+ for (i = 0; i < array->map.max_entries; i++) {
+ ptr = __alloc_percpu_gfp(array->elem_size, 8,
+ GFP_USER | __GFP_NOWARN);
+ if (!ptr) {
+ bpf_array_free_percpu(array);
+ return -ENOMEM;
+ }
+ array->pptrs[i] = ptr;
+ cond_resched();
+ }
+
+ return 0;
+}
+
/* Called from syscall */
static struct bpf_map *array_map_alloc(union bpf_attr *attr)
{
- u32 elem_size, array_size, index_mask, max_entries;
+ bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
+ u32 elem_size, index_mask, max_entries;
bool unpriv = !capable(CAP_SYS_ADMIN);
+ u64 cost, array_size, mask64;
struct bpf_array *array;
- u64 mask64;
+ int ret;
/* check sanity of attributes */
if (attr->max_entries == 0 || attr->key_size != 4 ||
- attr->value_size == 0)
+ attr->value_size == 0 ||
+ attr->map_flags & ~ARRAY_CREATE_FLAG_MASK)
return ERR_PTR(-EINVAL);
if (attr->value_size >= 1 << (KMALLOC_SHIFT_MAX - 1))
@@ -59,30 +93,50 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
return ERR_PTR(-E2BIG);
}
- /* check round_up into zero and u32 overflow */
- if (elem_size == 0 ||
- max_entries > (U32_MAX - PAGE_SIZE - sizeof(*array)) / elem_size)
+ array_size = sizeof(*array);
+ if (percpu)
+ array_size += (u64) max_entries * sizeof(void *);
+ else
+ array_size += (u64) max_entries * elem_size;
+
+ /* make sure there is no u32 overflow later in round_up() */
+ cost = array_size;
+ if (cost >= U32_MAX - PAGE_SIZE)
return ERR_PTR(-ENOMEM);
+ if (percpu) {
+ cost += (u64)attr->max_entries * elem_size * num_possible_cpus();
+ if (cost >= U32_MAX - PAGE_SIZE)
+ return ERR_PTR(-ENOMEM);
+ }
+ cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
- array_size = sizeof(*array) + max_entries * elem_size;
+ ret = bpf_map_precharge_memlock(cost);
+ if (ret < 0)
+ return ERR_PTR(ret);
/* allocate all map elements and zero-initialize them */
- array = kzalloc(array_size, GFP_USER | __GFP_NOWARN);
- if (!array) {
- array = vzalloc(array_size);
- if (!array)
- return ERR_PTR(-ENOMEM);
- }
+ array = bpf_map_area_alloc(array_size);
+ if (!array)
+ return ERR_PTR(-ENOMEM);
array->index_mask = index_mask;
array->map.unpriv_array = unpriv;
/* copy mandatory map attributes */
+ array->map.map_type = attr->map_type;
array->map.key_size = attr->key_size;
array->map.value_size = attr->value_size;
array->map.max_entries = attr->max_entries;
- array->map.pages = round_up(array_size, PAGE_SIZE) >> PAGE_SHIFT;
+ array->map.map_flags = attr->map_flags;
+ array->map.pages = cost;
array->elem_size = elem_size;
+ if (percpu &&
+ (elem_size > PCPU_MIN_UNIT_SIZE ||
+ bpf_array_alloc_percpu(array))) {
+ bpf_map_area_free(array);
+ return ERR_PTR(-ENOMEM);
+ }
+
return &array->map;
}
@@ -92,12 +146,50 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key)
struct bpf_array *array = container_of(map, struct bpf_array, map);
u32 index = *(u32 *)key;
- if (index >= array->map.max_entries)
+ if (unlikely(index >= array->map.max_entries))
return NULL;
return array->value + array->elem_size * (index & array->index_mask);
}
+/* Called from eBPF program */
+static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ u32 index = *(u32 *)key;
+
+ if (unlikely(index >= array->map.max_entries))
+ return NULL;
+
+ return this_cpu_ptr(array->pptrs[index & array->index_mask]);
+}
+
+int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ u32 index = *(u32 *)key;
+ void __percpu *pptr;
+ int cpu, off = 0;
+ u32 size;
+
+ if (unlikely(index >= array->map.max_entries))
+ return -ENOENT;
+
+ /* per_cpu areas are zero-filled and bpf programs can only
+ * access 'value_size' of them, so copying rounded areas
+ * will not leak any kernel data
+ */
+ size = round_up(map->value_size, 8);
+ rcu_read_lock();
+ pptr = array->pptrs[index & array->index_mask];
+ for_each_possible_cpu(cpu) {
+ bpf_long_memcpy(value + off, per_cpu_ptr(pptr, cpu), size);
+ off += size;
+ }
+ rcu_read_unlock();
+ return 0;
+}
+
/* Called from syscall */
static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
@@ -124,21 +216,63 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
struct bpf_array *array = container_of(map, struct bpf_array, map);
u32 index = *(u32 *)key;
- if (map_flags > BPF_EXIST)
+ if (unlikely(map_flags > BPF_EXIST))
/* unknown flags */
return -EINVAL;
- if (index >= array->map.max_entries)
+ if (unlikely(index >= array->map.max_entries))
/* all elements were pre-allocated, cannot insert a new one */
return -E2BIG;
- if (map_flags == BPF_NOEXIST)
+ if (unlikely(map_flags == BPF_NOEXIST))
/* all elements already exist */
return -EEXIST;
- memcpy(array->value +
- array->elem_size * (index & array->index_mask),
- value, map->value_size);
+ if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
+ memcpy(this_cpu_ptr(array->pptrs[index & array->index_mask]),
+ value, map->value_size);
+ else
+ memcpy(array->value +
+ array->elem_size * (index & array->index_mask),
+ value, map->value_size);
+ return 0;
+}
+
+int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ u32 index = *(u32 *)key;
+ void __percpu *pptr;
+ int cpu, off = 0;
+ u32 size;
+
+ if (unlikely(map_flags > BPF_EXIST))
+ /* unknown flags */
+ return -EINVAL;
+
+ if (unlikely(index >= array->map.max_entries))
+ /* all elements were pre-allocated, cannot insert a new one */
+ return -E2BIG;
+
+ if (unlikely(map_flags == BPF_NOEXIST))
+ /* all elements already exist */
+ return -EEXIST;
+
+ /* the user space will provide round_up(value_size, 8) bytes that
+ * will be copied into per-cpu area. bpf programs can only access
+ * value_size of it. During lookup the same extra bytes will be
+ * returned or zeros which were zero-filled by percpu_alloc,
+ * so no kernel data leaks possible
+ */
+ size = round_up(map->value_size, 8);
+ rcu_read_lock();
+ pptr = array->pptrs[index & array->index_mask];
+ for_each_possible_cpu(cpu) {
+ bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value + off, size);
+ off += size;
+ }
+ rcu_read_unlock();
return 0;
}
@@ -160,7 +294,10 @@ static void array_map_free(struct bpf_map *map)
*/
synchronize_rcu();
- kvfree(array);
+ if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
+ bpf_array_free_percpu(array);
+
+ bpf_map_area_free(array);
}
static const struct bpf_map_ops array_ops = {
@@ -177,9 +314,24 @@ static struct bpf_map_type_list array_type __read_mostly = {
.type = BPF_MAP_TYPE_ARRAY,
};
+static const struct bpf_map_ops percpu_array_ops = {
+ .map_alloc = array_map_alloc,
+ .map_free = array_map_free,
+ .map_get_next_key = array_map_get_next_key,
+ .map_lookup_elem = percpu_array_map_lookup_elem,
+ .map_update_elem = array_map_update_elem,
+ .map_delete_elem = array_map_delete_elem,
+};
+
+static struct bpf_map_type_list percpu_array_type __read_mostly = {
+ .ops = &percpu_array_ops,
+ .type = BPF_MAP_TYPE_PERCPU_ARRAY,
+};
+
static int __init register_array_map(void)
{
bpf_register_map_type(&array_type);
+ bpf_register_map_type(&percpu_array_type);
return 0;
}
late_initcall(register_array_map);
@@ -202,7 +354,8 @@ static void fd_array_map_free(struct bpf_map *map)
/* make sure it's empty */
for (i = 0; i < array->map.max_entries; i++)
BUG_ON(array->ptrs[i] != NULL);
- kvfree(array);
+
+ bpf_map_area_free(array);
}
static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key)
@@ -211,8 +364,8 @@ static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key)
}
/* only called from syscall */
-static int fd_array_map_update_elem(struct bpf_map *map, void *key,
- void *value, u64 map_flags)
+int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file,
+ void *key, void *value, u64 map_flags)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
void *new_ptr, *old_ptr;
@@ -225,7 +378,7 @@ static int fd_array_map_update_elem(struct bpf_map *map, void *key,
return -E2BIG;
ufd = *(u32 *)value;
- new_ptr = map->ops->map_fd_get_ptr(map, ufd);
+ new_ptr = map->ops->map_fd_get_ptr(map, map_file, ufd);
if (IS_ERR(new_ptr))
return PTR_ERR(new_ptr);
@@ -254,10 +407,12 @@ static int fd_array_map_delete_elem(struct bpf_map *map, void *key)
}
}
-static void *prog_fd_array_get_ptr(struct bpf_map *map, int fd)
+static void *prog_fd_array_get_ptr(struct bpf_map *map,
+ struct file *map_file, int fd)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
struct bpf_prog *prog = bpf_prog_get(fd);
+
if (IS_ERR(prog))
return prog;
@@ -265,6 +420,7 @@ static void *prog_fd_array_get_ptr(struct bpf_map *map, int fd)
bpf_prog_put(prog);
return ERR_PTR(-EINVAL);
}
+
return prog;
}
@@ -288,7 +444,6 @@ static const struct bpf_map_ops prog_array_ops = {
.map_free = fd_array_map_free,
.map_get_next_key = array_map_get_next_key,
.map_lookup_elem = fd_array_map_lookup_elem,
- .map_update_elem = fd_array_map_update_elem,
.map_delete_elem = fd_array_map_delete_elem,
.map_fd_get_ptr = prog_fd_array_get_ptr,
.map_fd_put_ptr = prog_fd_array_put_ptr,
@@ -306,58 +461,105 @@ static int __init register_prog_array_map(void)
}
late_initcall(register_prog_array_map);
-static void perf_event_array_map_free(struct bpf_map *map)
+static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
+ struct file *map_file)
{
- bpf_fd_array_map_clear(map);
- fd_array_map_free(map);
+ struct bpf_event_entry *ee;
+
+ ee = kzalloc(sizeof(*ee), GFP_ATOMIC);
+ if (ee) {
+ ee->event = perf_file->private_data;
+ ee->perf_file = perf_file;
+ ee->map_file = map_file;
+ }
+
+ return ee;
}
-static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd)
+static void __bpf_event_entry_free(struct rcu_head *rcu)
{
- struct perf_event *event;
- const struct perf_event_attr *attr;
+ struct bpf_event_entry *ee;
- event = perf_event_get(fd);
- if (IS_ERR(event))
- return event;
+ ee = container_of(rcu, struct bpf_event_entry, rcu);
+ fput(ee->perf_file);
+ kfree(ee);
+}
- attr = perf_event_attrs(event);
- if (IS_ERR(attr))
- goto err;
+static void bpf_event_entry_free_rcu(struct bpf_event_entry *ee)
+{
+ call_rcu(&ee->rcu, __bpf_event_entry_free);
+}
- if (attr->inherit)
- goto err;
+static void *perf_event_fd_array_get_ptr(struct bpf_map *map,
+ struct file *map_file, int fd)
+{
+ const struct perf_event_attr *attr;
+ struct bpf_event_entry *ee;
+ struct perf_event *event;
+ struct file *perf_file;
- if (attr->type == PERF_TYPE_RAW)
- return event;
+ perf_file = perf_event_get(fd);
+ if (IS_ERR(perf_file))
+ return perf_file;
- if (attr->type == PERF_TYPE_HARDWARE)
- return event;
+ event = perf_file->private_data;
+ ee = ERR_PTR(-EINVAL);
+
+ attr = perf_event_attrs(event);
+ if (IS_ERR(attr) || attr->inherit)
+ goto err_out;
+
+ switch (attr->type) {
+ case PERF_TYPE_SOFTWARE:
+ if (attr->config != PERF_COUNT_SW_BPF_OUTPUT)
+ goto err_out;
+ /* fall-through */
+ case PERF_TYPE_RAW:
+ case PERF_TYPE_HARDWARE:
+ ee = bpf_event_entry_gen(perf_file, map_file);
+ if (ee)
+ return ee;
+ ee = ERR_PTR(-ENOMEM);
+ /* fall-through */
+ default:
+ break;
+ }
- if (attr->type == PERF_TYPE_SOFTWARE &&
- attr->config == PERF_COUNT_SW_BPF_OUTPUT)
- return event;
-err:
- perf_event_release_kernel(event);
- return ERR_PTR(-EINVAL);
+err_out:
+ fput(perf_file);
+ return ee;
}
static void perf_event_fd_array_put_ptr(void *ptr)
{
- struct perf_event *event = ptr;
+ bpf_event_entry_free_rcu(ptr);
+}
+
+static void perf_event_fd_array_release(struct bpf_map *map,
+ struct file *map_file)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ struct bpf_event_entry *ee;
+ int i;
- perf_event_release_kernel(event);
+ rcu_read_lock();
+ for (i = 0; i < array->map.max_entries; i++) {
+ ee = READ_ONCE(array->ptrs[i]);
+ if (ee && ee->map_file == map_file)
+ fd_array_map_delete_elem(map, &i);
+ }
+ rcu_read_unlock();
}
static const struct bpf_map_ops perf_event_array_ops = {
.map_alloc = fd_array_map_alloc,
- .map_free = perf_event_array_map_free,
+ .map_free = fd_array_map_free,
.map_get_next_key = array_map_get_next_key,
.map_lookup_elem = fd_array_map_lookup_elem,
- .map_update_elem = fd_array_map_update_elem,
.map_delete_elem = fd_array_map_delete_elem,
.map_fd_get_ptr = perf_event_fd_array_get_ptr,
.map_fd_put_ptr = perf_event_fd_array_put_ptr,
+ .map_release = perf_event_fd_array_release,
};
static struct bpf_map_type_list perf_event_array_type __read_mostly = {
@@ -371,3 +573,46 @@ static int __init register_perf_event_array_map(void)
return 0;
}
late_initcall(register_perf_event_array_map);
+
+#ifdef CONFIG_CGROUPS
+static void *cgroup_fd_array_get_ptr(struct bpf_map *map,
+ struct file *map_file /* not used */,
+ int fd)
+{
+ return cgroup_get_from_fd(fd);
+}
+
+static void cgroup_fd_array_put_ptr(void *ptr)
+{
+ /* cgroup_put free cgrp after a rcu grace period */
+ cgroup_put(ptr);
+}
+
+static void cgroup_fd_array_free(struct bpf_map *map)
+{
+ bpf_fd_array_map_clear(map);
+ fd_array_map_free(map);
+}
+
+static const struct bpf_map_ops cgroup_array_ops = {
+ .map_alloc = fd_array_map_alloc,
+ .map_free = cgroup_fd_array_free,
+ .map_get_next_key = array_map_get_next_key,
+ .map_lookup_elem = fd_array_map_lookup_elem,
+ .map_delete_elem = fd_array_map_delete_elem,
+ .map_fd_get_ptr = cgroup_fd_array_get_ptr,
+ .map_fd_put_ptr = cgroup_fd_array_put_ptr,
+};
+
+static struct bpf_map_type_list cgroup_array_type __read_mostly = {
+ .ops = &cgroup_array_ops,
+ .type = BPF_MAP_TYPE_CGROUP_ARRAY,
+};
+
+static int __init register_cgroup_array_map(void)
+{
+ bpf_register_map_type(&cgroup_array_type);
+ return 0;
+}
+late_initcall(register_cgroup_array_map);
+#endif
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
new file mode 100644
index 000000000000..8210c7dd7532
--- /dev/null
+++ b/kernel/bpf/cgroup.c
@@ -0,0 +1,459 @@
+/*
+ * Functions to manage eBPF programs attached to cgroups
+ *
+ * Copyright (c) 2016 Daniel Mack
+ *
+ * This file is subject to the terms and conditions of version 2 of the GNU
+ * General Public License. See the file COPYING in the main directory of the
+ * Linux distribution for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/atomic.h>
+#include <linux/cgroup.h>
+#include <linux/slab.h>
+#include <linux/bpf.h>
+#include <linux/bpf-cgroup.h>
+#include <net/sock.h>
+
+DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key);
+EXPORT_SYMBOL(cgroup_bpf_enabled_key);
+
+/**
+ * cgroup_bpf_put() - put references of all bpf programs
+ * @cgrp: the cgroup to modify
+ */
+void cgroup_bpf_put(struct cgroup *cgrp)
+{
+ unsigned int type;
+
+ for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) {
+ struct list_head *progs = &cgrp->bpf.progs[type];
+ struct bpf_prog_list *pl, *tmp;
+
+ list_for_each_entry_safe(pl, tmp, progs, node) {
+ list_del(&pl->node);
+ bpf_prog_put(pl->prog);
+ kfree(pl);
+ static_branch_dec(&cgroup_bpf_enabled_key);
+ }
+ bpf_prog_array_free(cgrp->bpf.effective[type]);
+ }
+}
+
+/* count number of elements in the list.
+ * it's slow but the list cannot be long
+ */
+static u32 prog_list_length(struct list_head *head)
+{
+ struct bpf_prog_list *pl;
+ u32 cnt = 0;
+
+ list_for_each_entry(pl, head, node) {
+ if (!pl->prog)
+ continue;
+ cnt++;
+ }
+ return cnt;
+}
+
+/* if parent has non-overridable prog attached,
+ * disallow attaching new programs to the descendent cgroup.
+ * if parent has overridable or multi-prog, allow attaching
+ */
+static bool hierarchy_allows_attach(struct cgroup *cgrp,
+ enum bpf_attach_type type,
+ u32 new_flags)
+{
+ struct cgroup *p;
+
+ p = cgroup_parent(cgrp);
+ if (!p)
+ return true;
+ do {
+ u32 flags = p->bpf.flags[type];
+ u32 cnt;
+
+ if (flags & BPF_F_ALLOW_MULTI)
+ return true;
+ cnt = prog_list_length(&p->bpf.progs[type]);
+ WARN_ON_ONCE(cnt > 1);
+ if (cnt == 1)
+ return !!(flags & BPF_F_ALLOW_OVERRIDE);
+ p = cgroup_parent(p);
+ } while (p);
+ return true;
+}
+
+/* compute a chain of effective programs for a given cgroup:
+ * start from the list of programs in this cgroup and add
+ * all parent programs.
+ * Note that parent's F_ALLOW_OVERRIDE-type program is yielding
+ * to programs in this cgroup
+ */
+static int compute_effective_progs(struct cgroup *cgrp,
+ enum bpf_attach_type type,
+ struct bpf_prog_array __rcu **array)
+{
+ struct bpf_prog_array *progs;
+ struct bpf_prog_list *pl;
+ struct cgroup *p = cgrp;
+ int cnt = 0;
+
+ /* count number of effective programs by walking parents */
+ do {
+ if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI))
+ cnt += prog_list_length(&p->bpf.progs[type]);
+ p = cgroup_parent(p);
+ } while (p);
+
+ progs = bpf_prog_array_alloc(cnt, GFP_KERNEL);
+ if (!progs)
+ return -ENOMEM;
+
+ /* populate the array with effective progs */
+ cnt = 0;
+ p = cgrp;
+ do {
+ if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI))
+ list_for_each_entry(pl,
+ &p->bpf.progs[type], node) {
+ if (!pl->prog)
+ continue;
+ progs->progs[cnt++] = pl->prog;
+ }
+ p = cgroup_parent(p);
+ } while (p);
+
+ rcu_assign_pointer(*array, progs);
+ return 0;
+}
+
+static void activate_effective_progs(struct cgroup *cgrp,
+ enum bpf_attach_type type,
+ struct bpf_prog_array __rcu *array)
+{
+ struct bpf_prog_array __rcu *old_array;
+
+ old_array = xchg(&cgrp->bpf.effective[type], array);
+ /* free prog array after grace period, since __cgroup_bpf_run_*()
+ * might be still walking the array
+ */
+ bpf_prog_array_free(old_array);
+}
+
+/**
+ * cgroup_bpf_inherit() - inherit effective programs from parent
+ * @cgrp: the cgroup to modify
+ */
+int cgroup_bpf_inherit(struct cgroup *cgrp)
+{
+/* has to use marco instead of const int, since compiler thinks
+ * that array below is variable length
+ */
+#define NR ARRAY_SIZE(cgrp->bpf.effective)
+ struct bpf_prog_array __rcu *arrays[NR] = {};
+ int i;
+
+ for (i = 0; i < NR; i++)
+ INIT_LIST_HEAD(&cgrp->bpf.progs[i]);
+
+ for (i = 0; i < NR; i++)
+ if (compute_effective_progs(cgrp, i, &arrays[i]))
+ goto cleanup;
+
+ for (i = 0; i < NR; i++)
+ activate_effective_progs(cgrp, i, arrays[i]);
+
+ return 0;
+cleanup:
+ for (i = 0; i < NR; i++)
+ bpf_prog_array_free(arrays[i]);
+ return -ENOMEM;
+}
+
+#define BPF_CGROUP_MAX_PROGS 64
+
+/**
+ * __cgroup_bpf_attach() - Attach the program to a cgroup, and
+ * propagate the change to descendants
+ * @cgrp: The cgroup which descendants to traverse
+ * @prog: A program to attach
+ * @type: Type of attach operation
+ *
+ * Must be called with cgroup_mutex held.
+ */
+int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
+ enum bpf_attach_type type, u32 flags)
+{
+ struct list_head *progs = &cgrp->bpf.progs[type];
+ struct bpf_prog *old_prog = NULL;
+ struct cgroup_subsys_state *css;
+ struct bpf_prog_list *pl;
+ bool pl_was_allocated;
+ u32 old_flags;
+ int err;
+
+ if ((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI))
+ /* invalid combination */
+ return -EINVAL;
+
+ if (!hierarchy_allows_attach(cgrp, type, flags))
+ return -EPERM;
+
+ if (!list_empty(progs) && cgrp->bpf.flags[type] != flags)
+ /* Disallow attaching non-overridable on top
+ * of existing overridable in this cgroup.
+ * Disallow attaching multi-prog if overridable or none
+ */
+ return -EPERM;
+
+ if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS)
+ return -E2BIG;
+
+ if (flags & BPF_F_ALLOW_MULTI) {
+ list_for_each_entry(pl, progs, node)
+ if (pl->prog == prog)
+ /* disallow attaching the same prog twice */
+ return -EINVAL;
+
+ pl = kmalloc(sizeof(*pl), GFP_KERNEL);
+ if (!pl)
+ return -ENOMEM;
+ pl_was_allocated = true;
+ pl->prog = prog;
+ list_add_tail(&pl->node, progs);
+ } else {
+ if (list_empty(progs)) {
+ pl = kmalloc(sizeof(*pl), GFP_KERNEL);
+ if (!pl)
+ return -ENOMEM;
+ pl_was_allocated = true;
+ list_add_tail(&pl->node, progs);
+ } else {
+ pl = list_first_entry(progs, typeof(*pl), node);
+ old_prog = pl->prog;
+ pl_was_allocated = false;
+ }
+ pl->prog = prog;
+ }
+
+ old_flags = cgrp->bpf.flags[type];
+ cgrp->bpf.flags[type] = flags;
+
+ /* allocate and recompute effective prog arrays */
+ css_for_each_descendant_pre(css, &cgrp->self) {
+ struct cgroup *desc = container_of(css, struct cgroup, self);
+
+ err = compute_effective_progs(desc, type, &desc->bpf.inactive);
+ if (err)
+ goto cleanup;
+ }
+
+ /* all allocations were successful. Activate all prog arrays */
+ css_for_each_descendant_pre(css, &cgrp->self) {
+ struct cgroup *desc = container_of(css, struct cgroup, self);
+
+ activate_effective_progs(desc, type, desc->bpf.inactive);
+ desc->bpf.inactive = NULL;
+ }
+
+ static_branch_inc(&cgroup_bpf_enabled_key);
+ if (old_prog) {
+ bpf_prog_put(old_prog);
+ static_branch_dec(&cgroup_bpf_enabled_key);
+ }
+ return 0;
+
+cleanup:
+ /* oom while computing effective. Free all computed effective arrays
+ * since they were not activated
+ */
+ css_for_each_descendant_pre(css, &cgrp->self) {
+ struct cgroup *desc = container_of(css, struct cgroup, self);
+
+ bpf_prog_array_free(desc->bpf.inactive);
+ desc->bpf.inactive = NULL;
+ }
+
+ /* and cleanup the prog list */
+ pl->prog = old_prog;
+ if (pl_was_allocated) {
+ list_del(&pl->node);
+ kfree(pl);
+ }
+ return err;
+}
+
+/**
+ * __cgroup_bpf_detach() - Detach the program from a cgroup, and
+ * propagate the change to descendants
+ * @cgrp: The cgroup which descendants to traverse
+ * @prog: A program to detach or NULL
+ * @type: Type of detach operation
+ *
+ * Must be called with cgroup_mutex held.
+ */
+int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
+ enum bpf_attach_type type, u32 unused_flags)
+{
+ struct list_head *progs = &cgrp->bpf.progs[type];
+ u32 flags = cgrp->bpf.flags[type];
+ struct bpf_prog *old_prog = NULL;
+ struct cgroup_subsys_state *css;
+ struct bpf_prog_list *pl;
+ int err;
+
+ if (flags & BPF_F_ALLOW_MULTI) {
+ if (!prog)
+ /* to detach MULTI prog the user has to specify valid FD
+ * of the program to be detached
+ */
+ return -EINVAL;
+ } else {
+ if (list_empty(progs))
+ /* report error when trying to detach and nothing is attached */
+ return -ENOENT;
+ }
+
+ if (flags & BPF_F_ALLOW_MULTI) {
+ /* find the prog and detach it */
+ list_for_each_entry(pl, progs, node) {
+ if (pl->prog != prog)
+ continue;
+ old_prog = prog;
+ /* mark it deleted, so it's ignored while
+ * recomputing effective
+ */
+ pl->prog = NULL;
+ break;
+ }
+ if (!old_prog)
+ return -ENOENT;
+ } else {
+ /* to maintain backward compatibility NONE and OVERRIDE cgroups
+ * allow detaching with invalid FD (prog==NULL)
+ */
+ pl = list_first_entry(progs, typeof(*pl), node);
+ old_prog = pl->prog;
+ pl->prog = NULL;
+ }
+
+ /* allocate and recompute effective prog arrays */
+ css_for_each_descendant_pre(css, &cgrp->self) {
+ struct cgroup *desc = container_of(css, struct cgroup, self);
+
+ err = compute_effective_progs(desc, type, &desc->bpf.inactive);
+ if (err)
+ goto cleanup;
+ }
+
+ /* all allocations were successful. Activate all prog arrays */
+ css_for_each_descendant_pre(css, &cgrp->self) {
+ struct cgroup *desc = container_of(css, struct cgroup, self);
+
+ activate_effective_progs(desc, type, desc->bpf.inactive);
+ desc->bpf.inactive = NULL;
+ }
+
+ /* now can actually delete it from this cgroup list */
+ list_del(&pl->node);
+ kfree(pl);
+ if (list_empty(progs))
+ /* last program was detached, reset flags to zero */
+ cgrp->bpf.flags[type] = 0;
+
+ bpf_prog_put(old_prog);
+ static_branch_dec(&cgroup_bpf_enabled_key);
+ return 0;
+
+cleanup:
+ /* oom while computing effective. Free all computed effective arrays
+ * since they were not activated
+ */
+ css_for_each_descendant_pre(css, &cgrp->self) {
+ struct cgroup *desc = container_of(css, struct cgroup, self);
+
+ bpf_prog_array_free(desc->bpf.inactive);
+ desc->bpf.inactive = NULL;
+ }
+
+ /* and restore back old_prog */
+ pl->prog = old_prog;
+ return err;
+}
+
+/**
+ * __cgroup_bpf_run_filter() - Run a program for packet filtering
+ * @sk: The socket sending or receiving traffic
+ * @skb: The skb that is being sent or received
+ * @type: The type of program to be exectuted
+ *
+ * If no socket is passed, or the socket is not of type INET or INET6,
+ * this function does nothing and returns 0.
+ *
+ * The program type passed in via @type must be suitable for network
+ * filtering. No further check is performed to assert that.
+ *
+ * This function will return %-EPERM if any if an attached program was found
+ * and if it returned != 1 during execution. In all other cases, 0 is returned.
+ */
+int __cgroup_bpf_run_filter(struct sock *sk,
+ struct sk_buff *skb,
+ enum bpf_attach_type type)
+{
+ unsigned int offset = skb->data - skb_network_header(skb);
+ struct sock *save_sk;
+ struct cgroup *cgrp;
+ int ret;
+
+ if (!sk || !sk_fullsock(sk))
+ return 0;
+
+ if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
+ return 0;
+
+ cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+ save_sk = skb->sk;
+ skb->sk = sk;
+ __skb_push(skb, offset);
+ ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb,
+ bpf_prog_run_save_cb);
+ __skb_pull(skb, offset);
+ skb->sk = save_sk;
+ return ret == 1 ? 0 : -EPERM;
+}
+EXPORT_SYMBOL(__cgroup_bpf_run_filter);
+
+/**
+ * __cgroup_bpf_run_filter_sk() - Run a program on a sock
+ * @sk: sock structure to manipulate
+ * @type: The type of program to be exectuted
+ *
+ * socket is passed is expected to be of type INET or INET6.
+ *
+ * The program type passed in via @type must be suitable for sock
+ * filtering. No further check is performed to assert that.
+ *
+ * This function will return %-EPERM if any if an attached program was found
+ * and if it returned != 1 during execution. In all other cases, 0 is returned.
+ */
+int __cgroup_bpf_run_filter_sk(struct sock *sk,
+ enum bpf_attach_type type)
+{
+ struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+ struct bpf_prog *prog;
+ int ret = 0;
+
+
+ rcu_read_lock();
+
+ prog = rcu_dereference(cgrp->bpf.effective[type]->progs[0]);
+ if (prog)
+ ret = BPF_PROG_RUN(prog, sk) == 1 ? 0 : -EPERM;
+
+ rcu_read_unlock();
+
+ return ret;
+}
+EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index eb52d11fdaa7..d5c9dcfe9324 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -27,6 +27,7 @@
#include <linux/random.h>
#include <linux/moduleloader.h>
#include <linux/bpf.h>
+#include <linux/frame.h>
#include <asm/unaligned.h>
@@ -59,11 +60,13 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns
{
u8 *ptr = NULL;
- if (k >= SKF_NET_OFF)
+ if (k >= SKF_NET_OFF) {
ptr = skb_network_header(skb) + k - SKF_NET_OFF;
- else if (k >= SKF_LL_OFF)
+ } else if (k >= SKF_LL_OFF) {
+ if (unlikely(!skb_mac_header_was_set(skb)))
+ return NULL;
ptr = skb_mac_header(skb) + k - SKF_LL_OFF;
-
+ }
if (ptr >= skb->head && ptr + size <= skb_tail_pointer(skb))
return ptr;
@@ -104,19 +107,29 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO |
gfp_extra_flags;
struct bpf_prog *fp;
+ u32 pages, delta;
+ int ret;
BUG_ON(fp_old == NULL);
size = round_up(size, PAGE_SIZE);
- if (size <= fp_old->pages * PAGE_SIZE)
+ pages = size / PAGE_SIZE;
+ if (pages <= fp_old->pages)
return fp_old;
+ delta = pages - fp_old->pages;
+ ret = __bpf_prog_charge(fp_old->aux->user, delta);
+ if (ret)
+ return NULL;
+
fp = __vmalloc(size, gfp_flags, PAGE_KERNEL);
- if (fp != NULL) {
+ if (fp == NULL) {
+ __bpf_prog_uncharge(fp_old->aux->user, delta);
+ } else {
kmemcheck_annotate_bitfield(fp, meta);
memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE);
- fp->pages = size / PAGE_SIZE;
+ fp->pages = pages;
fp->aux->prog = fp;
/* We keep fp->aux from fp_old around in the new
@@ -128,14 +141,12 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
return fp;
}
-EXPORT_SYMBOL_GPL(bpf_prog_realloc);
void __bpf_prog_free(struct bpf_prog *fp)
{
kfree(fp->aux);
vfree(fp);
}
-EXPORT_SYMBOL_GPL(__bpf_prog_free);
static bool bpf_is_jmp_and_has_target(const struct bpf_insn *insn)
{
@@ -209,30 +220,94 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
}
#ifdef CONFIG_BPF_JIT
+/* All BPF JIT sysctl knobs here. */
+int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_ALWAYS_ON);
+int bpf_jit_harden __read_mostly;
+long bpf_jit_limit __read_mostly;
+long bpf_jit_limit_max __read_mostly;
+
+static atomic_long_t bpf_jit_current;
+
+/* Can be overridden by an arch's JIT compiler if it has a custom,
+ * dedicated BPF backend memory area, or if neither of the two
+ * below apply.
+ */
+u64 __weak bpf_jit_alloc_exec_limit(void)
+{
+#if defined(MODULES_VADDR)
+ return MODULES_END - MODULES_VADDR;
+#else
+ return VMALLOC_END - VMALLOC_START;
+#endif
+}
+
+static int __init bpf_jit_charge_init(void)
+{
+ /* Only used as heuristic here to derive limit. */
+ bpf_jit_limit_max = bpf_jit_alloc_exec_limit();
+ bpf_jit_limit = min_t(u64, round_up(bpf_jit_limit_max >> 2,
+ PAGE_SIZE), LONG_MAX);
+ return 0;
+}
+pure_initcall(bpf_jit_charge_init);
+
+static int bpf_jit_charge_modmem(u32 pages)
+{
+ if (atomic_long_add_return(pages, &bpf_jit_current) >
+ (bpf_jit_limit >> PAGE_SHIFT)) {
+ if (!capable(CAP_SYS_ADMIN)) {
+ atomic_long_sub(pages, &bpf_jit_current);
+ return -EPERM;
+ }
+ }
+
+ return 0;
+}
+
+static void bpf_jit_uncharge_modmem(u32 pages)
+{
+ atomic_long_sub(pages, &bpf_jit_current);
+}
+
+#if IS_ENABLED(CONFIG_BPF_JIT) && IS_ENABLED(CONFIG_CFI_CLANG)
+bool __weak arch_bpf_jit_check_func(const struct bpf_prog *prog)
+{
+ return true;
+}
+EXPORT_SYMBOL(arch_bpf_jit_check_func);
+#endif
+
struct bpf_binary_header *
bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
unsigned int alignment,
bpf_jit_fill_hole_t bpf_fill_ill_insns)
{
struct bpf_binary_header *hdr;
- unsigned int size, hole, start;
+ u32 size, hole, start, pages;
/* Most of BPF filters are really small, but if some of them
* fill a page, allow at least 128 extra bytes to insert a
* random section of illegal instructions.
*/
size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE);
+ pages = size / PAGE_SIZE;
+
+ if (bpf_jit_charge_modmem(pages))
+ return NULL;
hdr = module_alloc(size);
- if (hdr == NULL)
+ if (!hdr) {
+ bpf_jit_uncharge_modmem(pages);
return NULL;
+ }
/* Fill space with illegal/arch-dep instructions. */
bpf_fill_ill_insns(hdr, size);
- hdr->pages = size / PAGE_SIZE;
+ bpf_jit_set_header_magic(hdr);
+ hdr->pages = pages;
hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)),
PAGE_SIZE - sizeof(*hdr));
- start = (prandom_u32() % hole) & ~(alignment - 1);
+ start = (get_random_int() % hole) & ~(alignment - 1);
/* Leave a random number of instructions before BPF code. */
*image_ptr = &hdr->image[start];
@@ -242,7 +317,215 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
void bpf_jit_binary_free(struct bpf_binary_header *hdr)
{
+ u32 pages = hdr->pages;
+
module_memfree(hdr);
+ bpf_jit_uncharge_modmem(pages);
+}
+
+static int bpf_jit_blind_insn(const struct bpf_insn *from,
+ const struct bpf_insn *aux,
+ struct bpf_insn *to_buff)
+{
+ struct bpf_insn *to = to_buff;
+ u32 imm_rnd = get_random_int();
+ s16 off;
+
+ BUILD_BUG_ON(BPF_REG_AX + 1 != MAX_BPF_JIT_REG);
+ BUILD_BUG_ON(MAX_BPF_REG + 1 != MAX_BPF_JIT_REG);
+
+ if (from->imm == 0 &&
+ (from->code == (BPF_ALU | BPF_MOV | BPF_K) ||
+ from->code == (BPF_ALU64 | BPF_MOV | BPF_K))) {
+ *to++ = BPF_ALU64_REG(BPF_XOR, from->dst_reg, from->dst_reg);
+ goto out;
+ }
+
+ switch (from->code) {
+ case BPF_ALU | BPF_ADD | BPF_K:
+ case BPF_ALU | BPF_SUB | BPF_K:
+ case BPF_ALU | BPF_AND | BPF_K:
+ case BPF_ALU | BPF_OR | BPF_K:
+ case BPF_ALU | BPF_XOR | BPF_K:
+ case BPF_ALU | BPF_MUL | BPF_K:
+ case BPF_ALU | BPF_MOV | BPF_K:
+ case BPF_ALU | BPF_DIV | BPF_K:
+ case BPF_ALU | BPF_MOD | BPF_K:
+ *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
+ *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
+ *to++ = BPF_ALU32_REG(from->code, from->dst_reg, BPF_REG_AX);
+ break;
+
+ case BPF_ALU64 | BPF_ADD | BPF_K:
+ case BPF_ALU64 | BPF_SUB | BPF_K:
+ case BPF_ALU64 | BPF_AND | BPF_K:
+ case BPF_ALU64 | BPF_OR | BPF_K:
+ case BPF_ALU64 | BPF_XOR | BPF_K:
+ case BPF_ALU64 | BPF_MUL | BPF_K:
+ case BPF_ALU64 | BPF_MOV | BPF_K:
+ case BPF_ALU64 | BPF_DIV | BPF_K:
+ case BPF_ALU64 | BPF_MOD | BPF_K:
+ *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
+ *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
+ *to++ = BPF_ALU64_REG(from->code, from->dst_reg, BPF_REG_AX);
+ break;
+
+ case BPF_JMP | BPF_JEQ | BPF_K:
+ case BPF_JMP | BPF_JNE | BPF_K:
+ case BPF_JMP | BPF_JGT | BPF_K:
+ case BPF_JMP | BPF_JLT | BPF_K:
+ case BPF_JMP | BPF_JGE | BPF_K:
+ case BPF_JMP | BPF_JLE | BPF_K:
+ case BPF_JMP | BPF_JSGT | BPF_K:
+ case BPF_JMP | BPF_JSLT | BPF_K:
+ case BPF_JMP | BPF_JSGE | BPF_K:
+ case BPF_JMP | BPF_JSLE | BPF_K:
+ case BPF_JMP | BPF_JSET | BPF_K:
+ /* Accommodate for extra offset in case of a backjump. */
+ off = from->off;
+ if (off < 0)
+ off -= 2;
+ *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
+ *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
+ *to++ = BPF_JMP_REG(from->code, from->dst_reg, BPF_REG_AX, off);
+ break;
+
+ case BPF_LD | BPF_ABS | BPF_W:
+ case BPF_LD | BPF_ABS | BPF_H:
+ case BPF_LD | BPF_ABS | BPF_B:
+ *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
+ *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
+ *to++ = BPF_LD_IND(from->code, BPF_REG_AX, 0);
+ break;
+
+ case BPF_LD | BPF_IND | BPF_W:
+ case BPF_LD | BPF_IND | BPF_H:
+ case BPF_LD | BPF_IND | BPF_B:
+ *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
+ *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
+ *to++ = BPF_ALU32_REG(BPF_ADD, BPF_REG_AX, from->src_reg);
+ *to++ = BPF_LD_IND(from->code, BPF_REG_AX, 0);
+ break;
+
+ case BPF_LD | BPF_IMM | BPF_DW:
+ *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[1].imm);
+ *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
+ *to++ = BPF_ALU64_IMM(BPF_LSH, BPF_REG_AX, 32);
+ *to++ = BPF_ALU64_REG(BPF_MOV, aux[0].dst_reg, BPF_REG_AX);
+ break;
+ case 0: /* Part 2 of BPF_LD | BPF_IMM | BPF_DW. */
+ *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[0].imm);
+ *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
+ *to++ = BPF_ALU64_REG(BPF_OR, aux[0].dst_reg, BPF_REG_AX);
+ break;
+
+ case BPF_ST | BPF_MEM | BPF_DW:
+ case BPF_ST | BPF_MEM | BPF_W:
+ case BPF_ST | BPF_MEM | BPF_H:
+ case BPF_ST | BPF_MEM | BPF_B:
+ *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
+ *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
+ *to++ = BPF_STX_MEM(from->code, from->dst_reg, BPF_REG_AX, from->off);
+ break;
+ }
+out:
+ return to - to_buff;
+}
+
+static struct bpf_prog *bpf_prog_clone_create(struct bpf_prog *fp_other,
+ gfp_t gfp_extra_flags)
+{
+ gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO |
+ gfp_extra_flags;
+ struct bpf_prog *fp;
+
+ fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags, PAGE_KERNEL);
+ if (fp != NULL) {
+ kmemcheck_annotate_bitfield(fp, meta);
+
+ /* aux->prog still points to the fp_other one, so
+ * when promoting the clone to the real program,
+ * this still needs to be adapted.
+ */
+ memcpy(fp, fp_other, fp_other->pages * PAGE_SIZE);
+ }
+
+ return fp;
+}
+
+static void bpf_prog_clone_free(struct bpf_prog *fp)
+{
+ /* aux was stolen by the other clone, so we cannot free
+ * it from this path! It will be freed eventually by the
+ * other program on release.
+ *
+ * At this point, we don't need a deferred release since
+ * clone is guaranteed to not be locked.
+ */
+ fp->aux = NULL;
+ __bpf_prog_free(fp);
+}
+
+void bpf_jit_prog_release_other(struct bpf_prog *fp, struct bpf_prog *fp_other)
+{
+ /* We have to repoint aux->prog to self, as we don't
+ * know whether fp here is the clone or the original.
+ */
+ fp->aux->prog = fp;
+ bpf_prog_clone_free(fp_other);
+}
+
+struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog)
+{
+ struct bpf_insn insn_buff[16], aux[2];
+ struct bpf_prog *clone, *tmp;
+ int insn_delta, insn_cnt;
+ struct bpf_insn *insn;
+ int i, rewritten;
+
+ if (!bpf_jit_blinding_enabled())
+ return prog;
+
+ clone = bpf_prog_clone_create(prog, GFP_USER);
+ if (!clone)
+ return ERR_PTR(-ENOMEM);
+
+ insn_cnt = clone->len;
+ insn = clone->insnsi;
+
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ /* We temporarily need to hold the original ld64 insn
+ * so that we can still access the first part in the
+ * second blinding run.
+ */
+ if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW) &&
+ insn[1].code == 0)
+ memcpy(aux, insn, sizeof(aux));
+
+ rewritten = bpf_jit_blind_insn(insn, aux, insn_buff);
+ if (!rewritten)
+ continue;
+
+ tmp = bpf_patch_insn_single(clone, i, insn_buff, rewritten);
+ if (!tmp) {
+ /* Patching may have repointed aux->prog during
+ * realloc from the original one, so we need to
+ * fix it up here on error.
+ */
+ bpf_jit_prog_release_other(prog, clone);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ clone = tmp;
+ insn_delta = rewritten - 1;
+
+ /* Walk new program and skip insns we just inserted. */
+ insn = clone->insnsi + i + insn_delta;
+ insn_cnt += insn_delta;
+ i += insn_delta;
+ }
+
+ return clone;
}
#endif /* CONFIG_BPF_JIT */
@@ -264,7 +547,7 @@ EXPORT_SYMBOL_GPL(__bpf_call_base);
*
* Decode and execute eBPF instructions.
*/
-static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
+static unsigned int __bpf_prog_run(const struct sk_buff *ctx, const struct bpf_insn *insn)
{
u64 stack[MAX_BPF_STACK / sizeof(u64)];
u64 regs[MAX_BPF_REG], tmp;
@@ -325,7 +608,7 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
[BPF_ALU64 | BPF_NEG] = &&ALU64_NEG,
/* Call instruction */
[BPF_JMP | BPF_CALL] = &&JMP_CALL,
- [BPF_JMP | BPF_CALL | BPF_X] = &&JMP_TAIL_CALL,
+ [BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL,
/* Jumps */
[BPF_JMP | BPF_JA] = &&JMP_JA,
[BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X,
@@ -334,12 +617,20 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
[BPF_JMP | BPF_JNE | BPF_K] = &&JMP_JNE_K,
[BPF_JMP | BPF_JGT | BPF_X] = &&JMP_JGT_X,
[BPF_JMP | BPF_JGT | BPF_K] = &&JMP_JGT_K,
+ [BPF_JMP | BPF_JLT | BPF_X] = &&JMP_JLT_X,
+ [BPF_JMP | BPF_JLT | BPF_K] = &&JMP_JLT_K,
[BPF_JMP | BPF_JGE | BPF_X] = &&JMP_JGE_X,
[BPF_JMP | BPF_JGE | BPF_K] = &&JMP_JGE_K,
+ [BPF_JMP | BPF_JLE | BPF_X] = &&JMP_JLE_X,
+ [BPF_JMP | BPF_JLE | BPF_K] = &&JMP_JLE_K,
[BPF_JMP | BPF_JSGT | BPF_X] = &&JMP_JSGT_X,
[BPF_JMP | BPF_JSGT | BPF_K] = &&JMP_JSGT_K,
+ [BPF_JMP | BPF_JSLT | BPF_X] = &&JMP_JSLT_X,
+ [BPF_JMP | BPF_JSLT | BPF_K] = &&JMP_JSLT_K,
[BPF_JMP | BPF_JSGE | BPF_X] = &&JMP_JSGE_X,
[BPF_JMP | BPF_JSGE | BPF_K] = &&JMP_JSGE_K,
+ [BPF_JMP | BPF_JSLE | BPF_X] = &&JMP_JSLE_X,
+ [BPF_JMP | BPF_JSLE | BPF_K] = &&JMP_JSLE_K,
[BPF_JMP | BPF_JSET | BPF_X] = &&JMP_JSET_X,
[BPF_JMP | BPF_JSET | BPF_K] = &&JMP_JSET_K,
/* Program return */
@@ -378,10 +669,6 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)];
ARG1 = (u64) (unsigned long) ctx;
- /* Registers used in classic BPF programs need to be reset first. */
- regs[BPF_REG_A] = 0;
- regs[BPF_REG_X] = 0;
-
select_insn:
goto *jumptable[insn->code];
@@ -522,14 +809,13 @@ select_insn:
if (unlikely(index >= array->map.max_entries))
goto out;
-
if (unlikely(tail_call_cnt > MAX_TAIL_CALL_CNT))
goto out;
tail_call_cnt++;
prog = READ_ONCE(array->ptrs[index]);
- if (unlikely(!prog))
+ if (!prog)
goto out;
/* ARG1 at this point is guaranteed to point to CTX from
@@ -582,6 +868,18 @@ out:
CONT_JMP;
}
CONT;
+ JMP_JLT_X:
+ if (DST < SRC) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JLT_K:
+ if (DST < IMM) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
JMP_JGE_X:
if (DST >= SRC) {
insn += insn->off;
@@ -594,6 +892,18 @@ out:
CONT_JMP;
}
CONT;
+ JMP_JLE_X:
+ if (DST <= SRC) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JLE_K:
+ if (DST <= IMM) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
JMP_JSGT_X:
if (((s64) DST) > ((s64) SRC)) {
insn += insn->off;
@@ -606,6 +916,18 @@ out:
CONT_JMP;
}
CONT;
+ JMP_JSLT_X:
+ if (((s64) DST) < ((s64) SRC)) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JSLT_K:
+ if (((s64) DST) < ((s64) IMM)) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
JMP_JSGE_X:
if (((s64) DST) >= ((s64) SRC)) {
insn += insn->off;
@@ -618,6 +940,18 @@ out:
CONT_JMP;
}
CONT;
+ JMP_JSLE_X:
+ if (((s64) DST) <= ((s64) SRC)) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JSLE_K:
+ if (((s64) DST) <= ((s64) IMM)) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
JMP_JSET_X:
if (DST & SRC) {
insn += insn->off;
@@ -725,10 +1059,16 @@ load_byte:
WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code);
return 0;
}
+STACK_FRAME_NON_STANDARD(__bpf_prog_run); /* jump table */
#else
-static unsigned int __bpf_prog_ret0(void *ctx, const struct bpf_insn *insn)
+static unsigned int __bpf_prog_ret0_warn(void *ctx,
+ const struct bpf_insn *insn)
{
+ /* If this handler ever gets executed, then BPF_JIT_ALWAYS_ON
+ * is not working properly, so warn about it!
+ */
+ WARN_ON_ONCE(1);
return 0;
}
#endif
@@ -773,16 +1113,17 @@ static int bpf_check_tail_call(const struct bpf_prog *fp)
/**
* bpf_prog_select_runtime - select exec runtime for BPF program
* @fp: bpf_prog populated with internal BPF program
+ * @err: pointer to error variable
*
* Try to JIT eBPF program, if JIT is not available, use interpreter.
* The BPF program will be executed via BPF_PROG_RUN() macro.
*/
-int bpf_prog_select_runtime(struct bpf_prog *fp)
+struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
{
#ifndef CONFIG_BPF_JIT_ALWAYS_ON
fp->bpf_func = (void *) __bpf_prog_run;
#else
- fp->bpf_func = (void *) __bpf_prog_ret0;
+ fp->bpf_func = (void *) __bpf_prog_ret0_warn;
#endif
/* eBPF JITs can rewrite the program in case constant
@@ -791,10 +1132,12 @@ int bpf_prog_select_runtime(struct bpf_prog *fp)
* valid program, which in this case would simply not
* be JITed, but falls back to the interpreter.
*/
- bpf_int_jit_compile(fp);
+ fp = bpf_int_jit_compile(fp);
#ifdef CONFIG_BPF_JIT_ALWAYS_ON
- if (!fp->jited)
- return -ENOTSUPP;
+ if (!fp->jited) {
+ *err = -ENOTSUPP;
+ return fp;
+ }
#endif
bpf_prog_lock_ro(fp);
@@ -803,10 +1146,124 @@ int bpf_prog_select_runtime(struct bpf_prog *fp)
* with JITed or non JITed program concatenations and not
* all eBPF JITs might immediately support all features.
*/
- return bpf_check_tail_call(fp);
+ *err = bpf_check_tail_call(fp);
+
+ return fp;
}
EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);
+static unsigned int __bpf_prog_ret1(const struct sk_buff *ctx,
+ const struct bpf_insn *insn)
+{
+ return 1;
+}
+
+static struct bpf_prog_dummy {
+ struct bpf_prog prog;
+} dummy_bpf_prog = {
+ .prog = {
+ .bpf_func = __bpf_prog_ret1,
+ },
+};
+
+/* to avoid allocating empty bpf_prog_array for cgroups that
+ * don't have bpf program attached use one global 'empty_prog_array'
+ * It will not be modified the caller of bpf_prog_array_alloc()
+ * (since caller requested prog_cnt == 0)
+ * that pointer should be 'freed' by bpf_prog_array_free()
+ */
+static struct {
+ struct bpf_prog_array hdr;
+ struct bpf_prog *null_prog;
+} empty_prog_array = {
+ .null_prog = NULL,
+};
+
+struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags)
+{
+ if (prog_cnt)
+ return kzalloc(sizeof(struct bpf_prog_array) +
+ sizeof(struct bpf_prog *) * (prog_cnt + 1),
+ flags);
+
+ return &empty_prog_array.hdr;
+}
+
+void bpf_prog_array_free(struct bpf_prog_array __rcu *progs)
+{
+ if (!progs ||
+ progs == (struct bpf_prog_array __rcu *)&empty_prog_array.hdr)
+ return;
+ kfree_rcu(progs, rcu);
+}
+
+void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *progs,
+ struct bpf_prog *old_prog)
+{
+ struct bpf_prog **prog = progs->progs;
+
+ for (; *prog; prog++)
+ if (*prog == old_prog) {
+ WRITE_ONCE(*prog, &dummy_bpf_prog.prog);
+ break;
+ }
+}
+
+int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array,
+ struct bpf_prog *exclude_prog,
+ struct bpf_prog *include_prog,
+ struct bpf_prog_array **new_array)
+{
+ int new_prog_cnt, carry_prog_cnt = 0;
+ struct bpf_prog **existing_prog;
+ struct bpf_prog_array *array;
+ int new_prog_idx = 0;
+
+ /* Figure out how many existing progs we need to carry over to
+ * the new array.
+ */
+ if (old_array) {
+ existing_prog = old_array->progs;
+ for (; *existing_prog; existing_prog++) {
+ if (*existing_prog != exclude_prog &&
+ *existing_prog != &dummy_bpf_prog.prog)
+ carry_prog_cnt++;
+ if (*existing_prog == include_prog)
+ return -EEXIST;
+ }
+ }
+
+ /* How many progs (not NULL) will be in the new array? */
+ new_prog_cnt = carry_prog_cnt;
+ if (include_prog)
+ new_prog_cnt += 1;
+
+ /* Do we have any prog (not NULL) in the new array? */
+ if (!new_prog_cnt) {
+ *new_array = NULL;
+ return 0;
+ }
+
+ /* +1 as the end of prog_array is marked with NULL */
+ array = bpf_prog_array_alloc(new_prog_cnt + 1, GFP_KERNEL);
+ if (!array)
+ return -ENOMEM;
+
+ /* Fill in the new prog array */
+ if (carry_prog_cnt) {
+ existing_prog = old_array->progs;
+ for (; *existing_prog; existing_prog++)
+ if (*existing_prog != exclude_prog &&
+ *existing_prog != &dummy_bpf_prog.prog)
+ array->progs[new_prog_idx++] = *existing_prog;
+ }
+ if (include_prog)
+ array->progs[new_prog_idx++] = include_prog;
+ array->progs[new_prog_idx] = NULL;
+ *new_array = array;
+ return 0;
+}
+
static void bpf_prog_free_deferred(struct work_struct *work)
{
struct bpf_prog_aux *aux;
@@ -833,7 +1290,7 @@ void bpf_user_rnd_init_once(void)
prandom_init_once(&bpf_user_rnd_state);
}
-u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+BPF_CALL_0(bpf_user_rnd_u32)
{
/* Should someone ever have the rather unwise idea to use some
* of the registers passed into this function, then note that
@@ -846,7 +1303,7 @@ u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
state = &get_cpu_var(bpf_user_rnd_state);
res = prandom_u32_state(state);
- put_cpu_var(state);
+ put_cpu_var(bpf_user_rnd_state);
return res;
}
@@ -859,14 +1316,23 @@ const struct bpf_func_proto bpf_map_delete_elem_proto __weak;
const struct bpf_func_proto bpf_get_prandom_u32_proto __weak;
const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak;
const struct bpf_func_proto bpf_ktime_get_ns_proto __weak;
+
const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak;
const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
const struct bpf_func_proto bpf_get_current_comm_proto __weak;
+
const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
{
return NULL;
}
+u64 __weak
+bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
+ void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy)
+{
+ return -ENOTSUPP;
+}
+
/* Always built-in helper functions. */
const struct bpf_func_proto bpf_tail_call_proto = {
.func = NULL,
@@ -877,9 +1343,25 @@ const struct bpf_func_proto bpf_tail_call_proto = {
.arg3_type = ARG_ANYTHING,
};
-/* For classic BPF JITs that don't implement bpf_int_jit_compile(). */
-void __weak bpf_int_jit_compile(struct bpf_prog *prog)
+/* Stub for JITs that only support cBPF. eBPF programs are interpreted.
+ * It is encouraged to implement bpf_int_jit_compile() instead, so that
+ * eBPF and implicitly also cBPF can get JITed!
+ */
+struct bpf_prog * __weak bpf_int_jit_compile(struct bpf_prog *prog)
+{
+ return prog;
+}
+
+/* Stub for JITs that support eBPF. All cBPF code gets transformed into
+ * eBPF by the kernel and is later compiled by bpf_int_jit_compile().
+ */
+void __weak bpf_jit_compile(struct bpf_prog *prog)
+{
+}
+
+bool __weak bpf_helper_changes_skb_data(void *func)
{
+ return false;
}
/* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index a35abe048239..998f7466e69d 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -1,4 +1,5 @@
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ * Copyright (c) 2016 Facebook
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
@@ -12,39 +13,170 @@
#include <linux/bpf.h>
#include <linux/jhash.h>
#include <linux/filter.h>
-#include <linux/vmalloc.h>
+#include <linux/rculist_nulls.h>
+#include "percpu_freelist.h"
+#define HTAB_CREATE_FLAG_MASK \
+ (BPF_F_NO_PREALLOC | BPF_F_RDONLY | BPF_F_WRONLY)
+
+struct bucket {
+ struct hlist_nulls_head head;
+ raw_spinlock_t lock;
+};
struct bpf_htab {
struct bpf_map map;
- struct hlist_head *buckets;
- raw_spinlock_t lock;
- u32 count; /* number of elements in this hashtable */
+ struct bucket *buckets;
+ void *elems;
+ struct pcpu_freelist freelist;
+ void __percpu *extra_elems;
+ atomic_t count; /* number of elements in this hashtable */
u32 n_buckets; /* number of hash buckets */
u32 elem_size; /* size of each element in bytes */
};
+enum extra_elem_state {
+ HTAB_NOT_AN_EXTRA_ELEM = 0,
+ HTAB_EXTRA_ELEM_FREE,
+ HTAB_EXTRA_ELEM_USED
+};
+
/* each htab element is struct htab_elem + key + value */
struct htab_elem {
- struct hlist_node hash_node;
- struct rcu_head rcu;
+ union {
+ struct hlist_nulls_node hash_node;
+ struct {
+ void *padding;
+ union {
+ struct bpf_htab *htab;
+ struct pcpu_freelist_node fnode;
+ };
+ };
+ };
+ union {
+ struct rcu_head rcu;
+ enum extra_elem_state state;
+ };
u32 hash;
char key[0] __aligned(8);
};
+static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
+ void __percpu *pptr)
+{
+ *(void __percpu **)(l->key + key_size) = pptr;
+}
+
+static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size)
+{
+ return *(void __percpu **)(l->key + key_size);
+}
+
+static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i)
+{
+ return (struct htab_elem *) (htab->elems + i * htab->elem_size);
+}
+
+static void htab_free_elems(struct bpf_htab *htab)
+{
+ int i;
+
+ if (htab->map.map_type != BPF_MAP_TYPE_PERCPU_HASH)
+ goto free_elems;
+
+ for (i = 0; i < htab->map.max_entries; i++) {
+ void __percpu *pptr;
+
+ pptr = htab_elem_get_ptr(get_htab_elem(htab, i),
+ htab->map.key_size);
+ free_percpu(pptr);
+ }
+free_elems:
+ bpf_map_area_free(htab->elems);
+}
+
+static int prealloc_elems_and_freelist(struct bpf_htab *htab)
+{
+ int err = -ENOMEM, i;
+
+ htab->elems = bpf_map_area_alloc(htab->elem_size *
+ htab->map.max_entries);
+ if (!htab->elems)
+ return -ENOMEM;
+
+ if (htab->map.map_type != BPF_MAP_TYPE_PERCPU_HASH)
+ goto skip_percpu_elems;
+
+ for (i = 0; i < htab->map.max_entries; i++) {
+ u32 size = round_up(htab->map.value_size, 8);
+ void __percpu *pptr;
+
+ pptr = __alloc_percpu_gfp(size, 8, GFP_USER | __GFP_NOWARN);
+ if (!pptr)
+ goto free_elems;
+ htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size,
+ pptr);
+ }
+
+skip_percpu_elems:
+ err = pcpu_freelist_init(&htab->freelist);
+ if (err)
+ goto free_elems;
+
+ pcpu_freelist_populate(&htab->freelist,
+ htab->elems + offsetof(struct htab_elem, fnode),
+ htab->elem_size, htab->map.max_entries);
+
+ return 0;
+
+free_elems:
+ htab_free_elems(htab);
+ return err;
+}
+
+static int alloc_extra_elems(struct bpf_htab *htab)
+{
+ void __percpu *pptr;
+ int cpu;
+
+ pptr = __alloc_percpu_gfp(htab->elem_size, 8, GFP_USER | __GFP_NOWARN);
+ if (!pptr)
+ return -ENOMEM;
+
+ for_each_possible_cpu(cpu) {
+ ((struct htab_elem *)per_cpu_ptr(pptr, cpu))->state =
+ HTAB_EXTRA_ELEM_FREE;
+ }
+ htab->extra_elems = pptr;
+ return 0;
+}
+
/* Called from syscall */
static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
{
+ bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_HASH;
struct bpf_htab *htab;
int err, i;
+ u64 cost;
+
+ BUILD_BUG_ON(offsetof(struct htab_elem, htab) !=
+ offsetof(struct htab_elem, hash_node.pprev));
+ BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) !=
+ offsetof(struct htab_elem, hash_node.pprev));
+
+ if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK)
+ /* reserved bits should not be used */
+ return ERR_PTR(-EINVAL);
htab = kzalloc(sizeof(*htab), GFP_USER);
if (!htab)
return ERR_PTR(-ENOMEM);
/* mandatory map attributes */
+ htab->map.map_type = attr->map_type;
htab->map.key_size = attr->key_size;
htab->map.value_size = attr->value_size;
htab->map.max_entries = attr->max_entries;
+ htab->map.map_flags = attr->map_flags;
/* check sanity of attributes.
* value_size == 0 may be allowed in the future to use map as a set
@@ -73,43 +205,71 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
*/
goto free_htab;
+ if (percpu && round_up(htab->map.value_size, 8) > PCPU_MIN_UNIT_SIZE)
+ /* make sure the size for pcpu_alloc() is reasonable */
+ goto free_htab;
+
htab->elem_size = sizeof(struct htab_elem) +
- round_up(htab->map.key_size, 8) +
- htab->map.value_size;
+ round_up(htab->map.key_size, 8);
+ if (percpu)
+ htab->elem_size += sizeof(void *);
+ else
+ htab->elem_size += round_up(htab->map.value_size, 8);
/* prevent zero size kmalloc and check for u32 overflow */
if (htab->n_buckets == 0 ||
- htab->n_buckets > U32_MAX / sizeof(struct hlist_head))
+ htab->n_buckets > U32_MAX / sizeof(struct bucket))
goto free_htab;
- if ((u64) htab->n_buckets * sizeof(struct hlist_head) +
- (u64) htab->elem_size * htab->map.max_entries >=
- U32_MAX - PAGE_SIZE)
+ cost = (u64) htab->n_buckets * sizeof(struct bucket) +
+ (u64) htab->elem_size * htab->map.max_entries;
+
+ if (percpu)
+ cost += (u64) round_up(htab->map.value_size, 8) *
+ num_possible_cpus() * htab->map.max_entries;
+ else
+ cost += (u64) htab->elem_size * num_possible_cpus();
+
+ if (cost >= U32_MAX - PAGE_SIZE)
/* make sure page count doesn't overflow */
goto free_htab;
- htab->map.pages = round_up(htab->n_buckets * sizeof(struct hlist_head) +
- htab->elem_size * htab->map.max_entries,
- PAGE_SIZE) >> PAGE_SHIFT;
+ htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+
+ /* if map size is larger than memlock limit, reject it early */
+ err = bpf_map_precharge_memlock(htab->map.pages);
+ if (err)
+ goto free_htab;
err = -ENOMEM;
- htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct hlist_head),
- GFP_USER | __GFP_NOWARN);
+ htab->buckets = bpf_map_area_alloc(htab->n_buckets *
+ sizeof(struct bucket));
+ if (!htab->buckets)
+ goto free_htab;
- if (!htab->buckets) {
- htab->buckets = vmalloc(htab->n_buckets * sizeof(struct hlist_head));
- if (!htab->buckets)
- goto free_htab;
+ for (i = 0; i < htab->n_buckets; i++) {
+ INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
+ raw_spin_lock_init(&htab->buckets[i].lock);
}
- for (i = 0; i < htab->n_buckets; i++)
- INIT_HLIST_HEAD(&htab->buckets[i]);
+ if (!percpu) {
+ err = alloc_extra_elems(htab);
+ if (err)
+ goto free_buckets;
+ }
- raw_spin_lock_init(&htab->lock);
- htab->count = 0;
+ if (!(attr->map_flags & BPF_F_NO_PREALLOC)) {
+ err = prealloc_elems_and_freelist(htab);
+ if (err)
+ goto free_extra_elems;
+ }
return &htab->map;
+free_extra_elems:
+ free_percpu(htab->extra_elems);
+free_buckets:
+ bpf_map_area_free(htab->buckets);
free_htab:
kfree(htab);
return ERR_PTR(err);
@@ -120,28 +280,57 @@ static inline u32 htab_map_hash(const void *key, u32 key_len)
return jhash(key, key_len, 0);
}
-static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
+static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
{
return &htab->buckets[hash & (htab->n_buckets - 1)];
}
-static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash,
+static inline struct hlist_nulls_head *select_bucket(struct bpf_htab *htab, u32 hash)
+{
+ return &__select_bucket(htab, hash)->head;
+}
+
+/* this lookup function can only be called with bucket lock taken */
+static struct htab_elem *lookup_elem_raw(struct hlist_nulls_head *head, u32 hash,
void *key, u32 key_size)
{
+ struct hlist_nulls_node *n;
struct htab_elem *l;
- hlist_for_each_entry_rcu(l, head, hash_node)
+ hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
if (l->hash == hash && !memcmp(&l->key, key, key_size))
return l;
return NULL;
}
+/* can be called without bucket lock. it will repeat the loop in
+ * the unlikely event when elements moved from one bucket into another
+ * while link list is being walked
+ */
+static struct htab_elem *lookup_nulls_elem_raw(struct hlist_nulls_head *head,
+ u32 hash, void *key,
+ u32 key_size, u32 n_buckets)
+{
+ struct hlist_nulls_node *n;
+ struct htab_elem *l;
+
+again:
+ hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
+ if (l->hash == hash && !memcmp(&l->key, key, key_size))
+ return l;
+
+ if (unlikely(get_nulls_value(n) != (hash & (n_buckets - 1))))
+ goto again;
+
+ return NULL;
+}
+
/* Called from syscall or from eBPF program */
-static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
+static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
- struct hlist_head *head;
+ struct hlist_nulls_head *head;
struct htab_elem *l;
u32 hash, key_size;
@@ -154,7 +343,14 @@ static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
head = select_bucket(htab, hash);
- l = lookup_elem_raw(head, hash, key, key_size);
+ l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);
+
+ return l;
+}
+
+static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct htab_elem *l = __htab_map_lookup_elem(map, key);
if (l)
return l->key + round_up(map->key_size, 8);
@@ -166,7 +362,7 @@ static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
- struct hlist_head *head;
+ struct hlist_nulls_head *head;
struct htab_elem *l, *next_l;
u32 hash, key_size;
int i = 0;
@@ -183,13 +379,13 @@ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
head = select_bucket(htab, hash);
/* lookup the key */
- l = lookup_elem_raw(head, hash, key, key_size);
+ l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets);
if (!l)
goto find_first_elem;
/* key was found, get next key in the same bucket */
- next_l = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&l->hash_node)),
+ next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_next_rcu(&l->hash_node)),
struct htab_elem, hash_node);
if (next_l) {
@@ -208,7 +404,7 @@ find_first_elem:
head = select_bucket(htab, i);
/* pick first element in the bucket */
- next_l = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
+ next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_first_rcu(head)),
struct htab_elem, hash_node);
if (next_l) {
/* if it's not empty, just return it */
@@ -217,90 +413,274 @@ find_first_elem:
}
}
- /* itereated over all buckets and all elements */
+ /* iterated over all buckets and all elements */
return -ENOENT;
}
+static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
+{
+ if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
+ free_percpu(htab_elem_get_ptr(l, htab->map.key_size));
+ kfree(l);
+}
+
+static void htab_elem_free_rcu(struct rcu_head *head)
+{
+ struct htab_elem *l = container_of(head, struct htab_elem, rcu);
+ struct bpf_htab *htab = l->htab;
+
+ htab_elem_free(htab, l);
+}
+
+static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
+{
+ if (l->state == HTAB_EXTRA_ELEM_USED) {
+ l->state = HTAB_EXTRA_ELEM_FREE;
+ return;
+ }
+
+ if (!(htab->map.map_flags & BPF_F_NO_PREALLOC)) {
+ pcpu_freelist_push(&htab->freelist, &l->fnode);
+ } else {
+ atomic_dec(&htab->count);
+ l->htab = htab;
+ call_rcu(&l->rcu, htab_elem_free_rcu);
+ }
+}
+
+static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
+ void *value, u32 key_size, u32 hash,
+ bool percpu, bool onallcpus,
+ bool old_elem_exists)
+{
+ u32 size = htab->map.value_size;
+ bool prealloc = !(htab->map.map_flags & BPF_F_NO_PREALLOC);
+ struct htab_elem *l_new;
+ void __percpu *pptr;
+ int err = 0;
+
+ if (prealloc) {
+ struct pcpu_freelist_node *l;
+
+ l = pcpu_freelist_pop(&htab->freelist);
+ if (!l)
+ err = -E2BIG;
+ else
+ l_new = container_of(l, struct htab_elem, fnode);
+ } else {
+ if (atomic_inc_return(&htab->count) > htab->map.max_entries) {
+ atomic_dec(&htab->count);
+ err = -E2BIG;
+ } else {
+ l_new = kmalloc(htab->elem_size,
+ GFP_ATOMIC | __GFP_NOWARN);
+ if (!l_new)
+ return ERR_PTR(-ENOMEM);
+ }
+ }
+
+ if (err) {
+ if (!old_elem_exists)
+ return ERR_PTR(err);
+
+ /* if we're updating the existing element and the hash table
+ * is full, use per-cpu extra elems
+ */
+ l_new = this_cpu_ptr(htab->extra_elems);
+ if (l_new->state != HTAB_EXTRA_ELEM_FREE)
+ return ERR_PTR(-E2BIG);
+ l_new->state = HTAB_EXTRA_ELEM_USED;
+ } else {
+ l_new->state = HTAB_NOT_AN_EXTRA_ELEM;
+ }
+
+ memcpy(l_new->key, key, key_size);
+ if (percpu) {
+ /* round up value_size to 8 bytes */
+ size = round_up(size, 8);
+
+ if (prealloc) {
+ pptr = htab_elem_get_ptr(l_new, key_size);
+ } else {
+ /* alloc_percpu zero-fills */
+ pptr = __alloc_percpu_gfp(size, 8,
+ GFP_ATOMIC | __GFP_NOWARN);
+ if (!pptr) {
+ kfree(l_new);
+ return ERR_PTR(-ENOMEM);
+ }
+ }
+
+ if (!onallcpus) {
+ /* copy true value_size bytes */
+ memcpy(this_cpu_ptr(pptr), value, htab->map.value_size);
+ } else {
+ int off = 0, cpu;
+
+ for_each_possible_cpu(cpu) {
+ bpf_long_memcpy(per_cpu_ptr(pptr, cpu),
+ value + off, size);
+ off += size;
+ }
+ }
+ if (!prealloc)
+ htab_elem_set_ptr(l_new, key_size, pptr);
+ } else {
+ memcpy(l_new->key + round_up(key_size, 8), value, size);
+ }
+
+ l_new->hash = hash;
+ return l_new;
+}
+
+static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old,
+ u64 map_flags)
+{
+ if (l_old && map_flags == BPF_NOEXIST)
+ /* elem already exists */
+ return -EEXIST;
+
+ if (!l_old && map_flags == BPF_EXIST)
+ /* elem doesn't exist, cannot update it */
+ return -ENOENT;
+
+ return 0;
+}
+
/* Called from syscall or from eBPF program */
static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
u64 map_flags)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
- struct htab_elem *l_new, *l_old;
- struct hlist_head *head;
+ struct htab_elem *l_new = NULL, *l_old;
+ struct hlist_nulls_head *head;
unsigned long flags;
- u32 key_size;
+ struct bucket *b;
+ u32 key_size, hash;
int ret;
- if (map_flags > BPF_EXIST)
+ if (unlikely(map_flags > BPF_EXIST))
/* unknown flags */
return -EINVAL;
WARN_ON_ONCE(!rcu_read_lock_held());
- /* allocate new element outside of lock */
- l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN);
- if (!l_new)
- return -ENOMEM;
-
key_size = map->key_size;
- memcpy(l_new->key, key, key_size);
- memcpy(l_new->key + round_up(key_size, 8), value, map->value_size);
+ hash = htab_map_hash(key, key_size);
- l_new->hash = htab_map_hash(l_new->key, key_size);
+ b = __select_bucket(htab, hash);
+ head = &b->head;
/* bpf_map_update_elem() can be called in_irq() */
- raw_spin_lock_irqsave(&htab->lock, flags);
+ raw_spin_lock_irqsave(&b->lock, flags);
- head = select_bucket(htab, l_new->hash);
+ l_old = lookup_elem_raw(head, hash, key, key_size);
- l_old = lookup_elem_raw(head, l_new->hash, key, key_size);
+ ret = check_flags(htab, l_old, map_flags);
+ if (ret)
+ goto err;
- if (!l_old && unlikely(htab->count >= map->max_entries)) {
- /* if elem with this 'key' doesn't exist and we've reached
- * max_entries limit, fail insertion of new elem
- */
- ret = -E2BIG;
+ l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false,
+ !!l_old);
+ if (IS_ERR(l_new)) {
+ /* all pre-allocated elements are in use or memory exhausted */
+ ret = PTR_ERR(l_new);
goto err;
}
- if (l_old && map_flags == BPF_NOEXIST) {
- /* elem already exists */
- ret = -EEXIST;
- goto err;
+ /* add new element to the head of the list, so that
+ * concurrent search will find it before old elem
+ */
+ hlist_nulls_add_head_rcu(&l_new->hash_node, head);
+ if (l_old) {
+ hlist_nulls_del_rcu(&l_old->hash_node);
+ free_htab_elem(htab, l_old);
}
+ ret = 0;
+err:
+ raw_spin_unlock_irqrestore(&b->lock, flags);
+ return ret;
+}
- if (!l_old && map_flags == BPF_EXIST) {
- /* elem doesn't exist, cannot update it */
- ret = -ENOENT;
+static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags,
+ bool onallcpus)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct htab_elem *l_new = NULL, *l_old;
+ struct hlist_nulls_head *head;
+ unsigned long flags;
+ struct bucket *b;
+ u32 key_size, hash;
+ int ret;
+
+ if (unlikely(map_flags > BPF_EXIST))
+ /* unknown flags */
+ return -EINVAL;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ key_size = map->key_size;
+
+ hash = htab_map_hash(key, key_size);
+
+ b = __select_bucket(htab, hash);
+ head = &b->head;
+
+ /* bpf_map_update_elem() can be called in_irq() */
+ raw_spin_lock_irqsave(&b->lock, flags);
+
+ l_old = lookup_elem_raw(head, hash, key, key_size);
+
+ ret = check_flags(htab, l_old, map_flags);
+ if (ret)
goto err;
- }
- /* add new element to the head of the list, so that concurrent
- * search will find it before old elem
- */
- hlist_add_head_rcu(&l_new->hash_node, head);
if (l_old) {
- hlist_del_rcu(&l_old->hash_node);
- kfree_rcu(l_old, rcu);
+ void __percpu *pptr = htab_elem_get_ptr(l_old, key_size);
+ u32 size = htab->map.value_size;
+
+ /* per-cpu hash map can update value in-place */
+ if (!onallcpus) {
+ memcpy(this_cpu_ptr(pptr), value, size);
+ } else {
+ int off = 0, cpu;
+
+ size = round_up(size, 8);
+ for_each_possible_cpu(cpu) {
+ bpf_long_memcpy(per_cpu_ptr(pptr, cpu),
+ value + off, size);
+ off += size;
+ }
+ }
} else {
- htab->count++;
+ l_new = alloc_htab_elem(htab, key, value, key_size,
+ hash, true, onallcpus, false);
+ if (IS_ERR(l_new)) {
+ ret = PTR_ERR(l_new);
+ goto err;
+ }
+ hlist_nulls_add_head_rcu(&l_new->hash_node, head);
}
- raw_spin_unlock_irqrestore(&htab->lock, flags);
-
- return 0;
+ ret = 0;
err:
- raw_spin_unlock_irqrestore(&htab->lock, flags);
- kfree(l_new);
+ raw_spin_unlock_irqrestore(&b->lock, flags);
return ret;
}
+static int htab_percpu_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags)
+{
+ return __htab_percpu_map_update_elem(map, key, value, map_flags, false);
+}
+
/* Called from syscall or from eBPF program */
static int htab_map_delete_elem(struct bpf_map *map, void *key)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
- struct hlist_head *head;
+ struct hlist_nulls_head *head;
+ struct bucket *b;
struct htab_elem *l;
unsigned long flags;
u32 hash, key_size;
@@ -311,21 +691,20 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
key_size = map->key_size;
hash = htab_map_hash(key, key_size);
+ b = __select_bucket(htab, hash);
+ head = &b->head;
- raw_spin_lock_irqsave(&htab->lock, flags);
-
- head = select_bucket(htab, hash);
+ raw_spin_lock_irqsave(&b->lock, flags);
l = lookup_elem_raw(head, hash, key, key_size);
if (l) {
- hlist_del_rcu(&l->hash_node);
- htab->count--;
- kfree_rcu(l, rcu);
+ hlist_nulls_del_rcu(&l->hash_node);
+ free_htab_elem(htab, l);
ret = 0;
}
- raw_spin_unlock_irqrestore(&htab->lock, flags);
+ raw_spin_unlock_irqrestore(&b->lock, flags);
return ret;
}
@@ -334,18 +713,17 @@ static void delete_all_elements(struct bpf_htab *htab)
int i;
for (i = 0; i < htab->n_buckets; i++) {
- struct hlist_head *head = select_bucket(htab, i);
- struct hlist_node *n;
+ struct hlist_nulls_head *head = select_bucket(htab, i);
+ struct hlist_nulls_node *n;
struct htab_elem *l;
- hlist_for_each_entry_safe(l, n, head, hash_node) {
- hlist_del_rcu(&l->hash_node);
- htab->count--;
- kfree(l);
+ hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
+ hlist_nulls_del_rcu(&l->hash_node);
+ if (l->state != HTAB_EXTRA_ELEM_USED)
+ htab_elem_free(htab, l);
}
}
}
-
/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void htab_map_free(struct bpf_map *map)
{
@@ -358,11 +736,18 @@ static void htab_map_free(struct bpf_map *map)
*/
synchronize_rcu();
- /* some of kfree_rcu() callbacks for elements of this map may not have
- * executed. It's ok. Proceed to free residual elements and map itself
+ /* some of free_htab_elem() callbacks for elements of this map may
+ * not have executed. Wait for them.
*/
- delete_all_elements(htab);
- kvfree(htab->buckets);
+ rcu_barrier();
+ if (htab->map.map_flags & BPF_F_NO_PREALLOC) {
+ delete_all_elements(htab);
+ } else {
+ htab_free_elems(htab);
+ pcpu_freelist_destroy(&htab->freelist);
+ }
+ free_percpu(htab->extra_elems);
+ bpf_map_area_free(htab->buckets);
kfree(htab);
}
@@ -380,9 +765,76 @@ static struct bpf_map_type_list htab_type __read_mostly = {
.type = BPF_MAP_TYPE_HASH,
};
+/* Called from eBPF program */
+static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct htab_elem *l = __htab_map_lookup_elem(map, key);
+
+ if (l)
+ return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size));
+ else
+ return NULL;
+}
+
+int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
+{
+ struct htab_elem *l;
+ void __percpu *pptr;
+ int ret = -ENOENT;
+ int cpu, off = 0;
+ u32 size;
+
+ /* per_cpu areas are zero-filled and bpf programs can only
+ * access 'value_size' of them, so copying rounded areas
+ * will not leak any kernel data
+ */
+ size = round_up(map->value_size, 8);
+ rcu_read_lock();
+ l = __htab_map_lookup_elem(map, key);
+ if (!l)
+ goto out;
+ pptr = htab_elem_get_ptr(l, map->key_size);
+ for_each_possible_cpu(cpu) {
+ bpf_long_memcpy(value + off,
+ per_cpu_ptr(pptr, cpu), size);
+ off += size;
+ }
+ ret = 0;
+out:
+ rcu_read_unlock();
+ return ret;
+}
+
+int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
+{
+ int ret;
+
+ rcu_read_lock();
+ ret = __htab_percpu_map_update_elem(map, key, value, map_flags, true);
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static const struct bpf_map_ops htab_percpu_ops = {
+ .map_alloc = htab_map_alloc,
+ .map_free = htab_map_free,
+ .map_get_next_key = htab_map_get_next_key,
+ .map_lookup_elem = htab_percpu_map_lookup_elem,
+ .map_update_elem = htab_percpu_map_update_elem,
+ .map_delete_elem = htab_map_delete_elem,
+};
+
+static struct bpf_map_type_list htab_percpu_type __read_mostly = {
+ .ops = &htab_percpu_ops,
+ .type = BPF_MAP_TYPE_PERCPU_HASH,
+};
+
static int __init register_htab_map(void)
{
bpf_register_map_type(&htab_type);
+ bpf_register_map_type(&htab_percpu_type);
return 0;
}
late_initcall(register_htab_map);
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 50da680c479f..39918402e6e9 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -16,6 +16,7 @@
#include <linux/ktime.h>
#include <linux/sched.h>
#include <linux/uidgid.h>
+#include <linux/filter.h>
/* If kernel subsystem is allowing eBPF programs to call this function,
* inside its own verifier_ops->get_func_proto() callback it should return
@@ -26,48 +27,32 @@
* if program is allowed to access maps, so check rcu_read_lock_held in
* all three functions.
*/
-static u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+BPF_CALL_2(bpf_map_lookup_elem, struct bpf_map *, map, void *, key)
{
- /* verifier checked that R1 contains a valid pointer to bpf_map
- * and R2 points to a program stack and map->key_size bytes were
- * initialized
- */
- struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
- void *key = (void *) (unsigned long) r2;
- void *value;
-
WARN_ON_ONCE(!rcu_read_lock_held());
-
- value = map->ops->map_lookup_elem(map, key);
-
- /* lookup() returns either pointer to element value or NULL
- * which is the meaning of PTR_TO_MAP_VALUE_OR_NULL type
- */
- return (unsigned long) value;
+ return (unsigned long) map->ops->map_lookup_elem(map, key);
}
const struct bpf_func_proto bpf_map_lookup_elem_proto = {
.func = bpf_map_lookup_elem,
.gpl_only = false,
+ .pkt_access = true,
.ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
.arg1_type = ARG_CONST_MAP_PTR,
.arg2_type = ARG_PTR_TO_MAP_KEY,
};
-static u64 bpf_map_update_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+BPF_CALL_4(bpf_map_update_elem, struct bpf_map *, map, void *, key,
+ void *, value, u64, flags)
{
- struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
- void *key = (void *) (unsigned long) r2;
- void *value = (void *) (unsigned long) r3;
-
WARN_ON_ONCE(!rcu_read_lock_held());
-
- return map->ops->map_update_elem(map, key, value, r4);
+ return map->ops->map_update_elem(map, key, value, flags);
}
const struct bpf_func_proto bpf_map_update_elem_proto = {
.func = bpf_map_update_elem,
.gpl_only = false,
+ .pkt_access = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_CONST_MAP_PTR,
.arg2_type = ARG_PTR_TO_MAP_KEY,
@@ -75,19 +60,16 @@ const struct bpf_func_proto bpf_map_update_elem_proto = {
.arg4_type = ARG_ANYTHING,
};
-static u64 bpf_map_delete_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+BPF_CALL_2(bpf_map_delete_elem, struct bpf_map *, map, void *, key)
{
- struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
- void *key = (void *) (unsigned long) r2;
-
WARN_ON_ONCE(!rcu_read_lock_held());
-
return map->ops->map_delete_elem(map, key);
}
const struct bpf_func_proto bpf_map_delete_elem_proto = {
.func = bpf_map_delete_elem,
.gpl_only = false,
+ .pkt_access = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_CONST_MAP_PTR,
.arg2_type = ARG_PTR_TO_MAP_KEY,
@@ -99,9 +81,9 @@ const struct bpf_func_proto bpf_get_prandom_u32_proto = {
.ret_type = RET_INTEGER,
};
-static u64 bpf_get_smp_processor_id(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+BPF_CALL_0(bpf_get_smp_processor_id)
{
- return raw_smp_processor_id();
+ return smp_processor_id();
}
const struct bpf_func_proto bpf_get_smp_processor_id_proto = {
@@ -110,7 +92,7 @@ const struct bpf_func_proto bpf_get_smp_processor_id_proto = {
.ret_type = RET_INTEGER,
};
-static u64 bpf_ktime_get_ns(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+BPF_CALL_0(bpf_ktime_get_ns)
{
/* NMI safe access to clock monotonic */
return ktime_get_mono_fast_ns();
@@ -122,11 +104,11 @@ const struct bpf_func_proto bpf_ktime_get_ns_proto = {
.ret_type = RET_INTEGER,
};
-static u64 bpf_get_current_pid_tgid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+BPF_CALL_0(bpf_get_current_pid_tgid)
{
struct task_struct *task = current;
- if (!task)
+ if (unlikely(!task))
return -EINVAL;
return (u64) task->tgid << 32 | task->pid;
@@ -138,18 +120,18 @@ const struct bpf_func_proto bpf_get_current_pid_tgid_proto = {
.ret_type = RET_INTEGER,
};
-static u64 bpf_get_current_uid_gid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+BPF_CALL_0(bpf_get_current_uid_gid)
{
struct task_struct *task = current;
kuid_t uid;
kgid_t gid;
- if (!task)
+ if (unlikely(!task))
return -EINVAL;
current_uid_gid(&uid, &gid);
return (u64) from_kgid(&init_user_ns, gid) << 32 |
- from_kuid(&init_user_ns, uid);
+ from_kuid(&init_user_ns, uid);
}
const struct bpf_func_proto bpf_get_current_uid_gid_proto = {
@@ -158,22 +140,30 @@ const struct bpf_func_proto bpf_get_current_uid_gid_proto = {
.ret_type = RET_INTEGER,
};
-static u64 bpf_get_current_comm(u64 r1, u64 size, u64 r3, u64 r4, u64 r5)
+BPF_CALL_2(bpf_get_current_comm, char *, buf, u32, size)
{
struct task_struct *task = current;
- char *buf = (char *) (long) r1;
- if (!task)
- return -EINVAL;
+ if (unlikely(!task))
+ goto err_clear;
+
+ strncpy(buf, task->comm, size);
- strlcpy(buf, task->comm, min_t(size_t, size, sizeof(task->comm)));
+ /* Verifier guarantees that size > 0. For task->comm exceeding
+ * size, guarantee that buf is %NUL-terminated. Unconditionally
+ * done here to save the size test.
+ */
+ buf[size - 1] = 0;
return 0;
+err_clear:
+ memset(buf, 0, size);
+ return -EINVAL;
}
const struct bpf_func_proto bpf_get_current_comm_proto = {
.func = bpf_get_current_comm,
.gpl_only = false,
.ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_STACK,
+ .arg1_type = ARG_PTR_TO_RAW_STACK,
.arg2_type = ARG_CONST_STACK_SIZE,
};
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index cb85d228b1ac..62c4ab07a7ac 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -11,7 +11,7 @@
* version 2 as published by the Free Software Foundation.
*/
-#include <linux/module.h>
+#include <linux/init.h>
#include <linux/magic.h>
#include <linux/major.h>
#include <linux/mount.h>
@@ -119,18 +119,10 @@ static int bpf_inode_type(const struct inode *inode, enum bpf_type *type)
return 0;
}
-static bool bpf_dname_reserved(const struct dentry *dentry)
-{
- return strchr(dentry->d_name.name, '.');
-}
-
static int bpf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
struct inode *inode;
- if (bpf_dname_reserved(dentry))
- return -EPERM;
-
inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFDIR);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -152,9 +144,6 @@ static int bpf_mkobj_ops(struct inode *dir, struct dentry *dentry,
{
struct inode *inode;
- if (bpf_dname_reserved(dentry))
- return -EPERM;
-
inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFREG);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -187,11 +176,21 @@ static int bpf_mkobj(struct inode *dir, struct dentry *dentry, umode_t mode,
}
}
+static struct dentry *
+bpf_lookup(struct inode *dir, struct dentry *dentry, unsigned flags)
+{
+ if (strchr(dentry->d_name.name, '.'))
+ return ERR_PTR(-EPERM);
+ return simple_lookup(dir, dentry, flags);
+}
+
static const struct inode_operations bpf_dir_iops = {
- .lookup = simple_lookup,
+ .lookup = bpf_lookup,
.mknod = bpf_mkobj,
.mkdir = bpf_mkdir,
.rmdir = simple_rmdir,
+ .rename2 = simple_rename,
+ .link = simple_link,
.unlink = simple_unlink,
};
@@ -256,7 +255,7 @@ out:
}
static void *bpf_obj_do_get(const struct filename *pathname,
- enum bpf_type *type)
+ enum bpf_type *type, int flags)
{
struct inode *inode;
struct path path;
@@ -268,7 +267,7 @@ static void *bpf_obj_do_get(const struct filename *pathname,
return ERR_PTR(ret);
inode = d_backing_inode(path.dentry);
- ret = inode_permission(inode, MAY_WRITE);
+ ret = inode_permission(inode, ACC_MODE(flags));
if (ret)
goto out;
@@ -287,18 +286,23 @@ out:
return ERR_PTR(ret);
}
-int bpf_obj_get_user(const char __user *pathname)
+int bpf_obj_get_user(const char __user *pathname, int flags)
{
enum bpf_type type = BPF_TYPE_UNSPEC;
struct filename *pname;
int ret = -ENOENT;
+ int f_flags;
void *raw;
+ f_flags = bpf_get_file_flag(flags);
+ if (f_flags < 0)
+ return f_flags;
+
pname = getname(pathname);
if (IS_ERR(pname))
return PTR_ERR(pname);
- raw = bpf_obj_do_get(pname, &type);
+ raw = bpf_obj_do_get(pname, &type, f_flags);
if (IS_ERR(raw)) {
ret = PTR_ERR(raw);
goto out;
@@ -307,7 +311,7 @@ int bpf_obj_get_user(const char __user *pathname)
if (type == BPF_TYPE_PROG)
ret = bpf_prog_new_fd(raw);
else if (type == BPF_TYPE_MAP)
- ret = bpf_map_new_fd(raw);
+ ret = bpf_map_new_fd(raw, f_flags);
else
goto out;
@@ -318,6 +322,42 @@ out:
return ret;
}
+static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type type)
+{
+ struct bpf_prog *prog;
+ int ret = inode_permission(inode, MAY_READ);
+ if (ret)
+ return ERR_PTR(ret);
+
+ if (inode->i_op == &bpf_map_iops)
+ return ERR_PTR(-EINVAL);
+ if (inode->i_op != &bpf_prog_iops)
+ return ERR_PTR(-EACCES);
+
+ prog = inode->i_private;
+
+ ret = security_bpf_prog(prog);
+ if (ret < 0)
+ return ERR_PTR(ret);
+
+ return bpf_prog_inc(prog);
+}
+
+struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type)
+{
+ struct bpf_prog *prog;
+ struct path path;
+ int ret = kern_path(name, LOOKUP_FOLLOW, &path);
+ if (ret)
+ return ERR_PTR(ret);
+ prog = __get_prog_inode(d_backing_inode(path.dentry), type);
+ if (!IS_ERR(prog))
+ touch_atime(&path);
+ path_put(&path);
+ return prog;
+}
+EXPORT_SYMBOL(bpf_prog_get_type_path);
+
static void bpf_evict_inode(struct inode *inode)
{
enum bpf_type type;
@@ -368,8 +408,6 @@ static struct file_system_type bpf_fs_type = {
.kill_sb = kill_litter_super,
};
-MODULE_ALIAS_FS("bpf");
-
static int __init bpf_init(void)
{
int ret;
diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c
new file mode 100644
index 000000000000..673fa6fe2d73
--- /dev/null
+++ b/kernel/bpf/percpu_freelist.c
@@ -0,0 +1,104 @@
+/* Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include "percpu_freelist.h"
+
+int pcpu_freelist_init(struct pcpu_freelist *s)
+{
+ int cpu;
+
+ s->freelist = alloc_percpu(struct pcpu_freelist_head);
+ if (!s->freelist)
+ return -ENOMEM;
+
+ for_each_possible_cpu(cpu) {
+ struct pcpu_freelist_head *head = per_cpu_ptr(s->freelist, cpu);
+
+ raw_spin_lock_init(&head->lock);
+ head->first = NULL;
+ }
+ return 0;
+}
+
+void pcpu_freelist_destroy(struct pcpu_freelist *s)
+{
+ free_percpu(s->freelist);
+}
+
+static inline void __pcpu_freelist_push(struct pcpu_freelist_head *head,
+ struct pcpu_freelist_node *node)
+{
+ raw_spin_lock(&head->lock);
+ node->next = head->first;
+ head->first = node;
+ raw_spin_unlock(&head->lock);
+}
+
+void pcpu_freelist_push(struct pcpu_freelist *s,
+ struct pcpu_freelist_node *node)
+{
+ struct pcpu_freelist_head *head = this_cpu_ptr(s->freelist);
+
+ __pcpu_freelist_push(head, node);
+}
+
+void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size,
+ u32 nr_elems)
+{
+ struct pcpu_freelist_head *head;
+ unsigned long flags;
+ int i, cpu, pcpu_entries;
+
+ pcpu_entries = nr_elems / num_possible_cpus() + 1;
+ i = 0;
+
+ /* disable irq to workaround lockdep false positive
+ * in bpf usage pcpu_freelist_populate() will never race
+ * with pcpu_freelist_push()
+ */
+ local_irq_save(flags);
+ for_each_possible_cpu(cpu) {
+again:
+ head = per_cpu_ptr(s->freelist, cpu);
+ __pcpu_freelist_push(head, buf);
+ i++;
+ buf += elem_size;
+ if (i == nr_elems)
+ break;
+ if (i % pcpu_entries)
+ goto again;
+ }
+ local_irq_restore(flags);
+}
+
+struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s)
+{
+ struct pcpu_freelist_head *head;
+ struct pcpu_freelist_node *node;
+ unsigned long flags;
+ int orig_cpu, cpu;
+
+ local_irq_save(flags);
+ orig_cpu = cpu = raw_smp_processor_id();
+ while (1) {
+ head = per_cpu_ptr(s->freelist, cpu);
+ raw_spin_lock(&head->lock);
+ node = head->first;
+ if (node) {
+ head->first = node->next;
+ raw_spin_unlock_irqrestore(&head->lock, flags);
+ return node;
+ }
+ raw_spin_unlock(&head->lock);
+ cpu = cpumask_next(cpu, cpu_possible_mask);
+ if (cpu >= nr_cpu_ids)
+ cpu = 0;
+ if (cpu == orig_cpu) {
+ local_irq_restore(flags);
+ return NULL;
+ }
+ }
+}
diff --git a/kernel/bpf/percpu_freelist.h b/kernel/bpf/percpu_freelist.h
new file mode 100644
index 000000000000..3049aae8ea1e
--- /dev/null
+++ b/kernel/bpf/percpu_freelist.h
@@ -0,0 +1,31 @@
+/* Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#ifndef __PERCPU_FREELIST_H__
+#define __PERCPU_FREELIST_H__
+#include <linux/spinlock.h>
+#include <linux/percpu.h>
+
+struct pcpu_freelist_head {
+ struct pcpu_freelist_node *first;
+ raw_spinlock_t lock;
+};
+
+struct pcpu_freelist {
+ struct pcpu_freelist_head __percpu *freelist;
+};
+
+struct pcpu_freelist_node {
+ struct pcpu_freelist_node *next;
+};
+
+void pcpu_freelist_push(struct pcpu_freelist *, struct pcpu_freelist_node *);
+struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *);
+void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size,
+ u32 nr_elems);
+int pcpu_freelist_init(struct pcpu_freelist *);
+void pcpu_freelist_destroy(struct pcpu_freelist *s);
+#endif
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
new file mode 100644
index 000000000000..22857ee8ce70
--- /dev/null
+++ b/kernel/bpf/stackmap.c
@@ -0,0 +1,293 @@
+/* Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/bpf.h>
+#include <linux/jhash.h>
+#include <linux/filter.h>
+#include <linux/stacktrace.h>
+#include <linux/perf_event.h>
+#include "percpu_freelist.h"
+
+#define STACK_CREATE_FLAG_MASK \
+ (BPF_F_RDONLY | BPF_F_WRONLY)
+
+struct stack_map_bucket {
+ struct pcpu_freelist_node fnode;
+ u32 hash;
+ u32 nr;
+ u64 ip[];
+};
+
+struct bpf_stack_map {
+ struct bpf_map map;
+ void *elems;
+ struct pcpu_freelist freelist;
+ u32 n_buckets;
+ struct stack_map_bucket *buckets[];
+};
+
+static int prealloc_elems_and_freelist(struct bpf_stack_map *smap)
+{
+ u64 elem_size = sizeof(struct stack_map_bucket) +
+ (u64)smap->map.value_size;
+ int err;
+
+ smap->elems = bpf_map_area_alloc(elem_size * smap->map.max_entries);
+ if (!smap->elems)
+ return -ENOMEM;
+
+ err = pcpu_freelist_init(&smap->freelist);
+ if (err)
+ goto free_elems;
+
+ pcpu_freelist_populate(&smap->freelist, smap->elems, elem_size,
+ smap->map.max_entries);
+ return 0;
+
+free_elems:
+ bpf_map_area_free(smap->elems);
+ return err;
+}
+
+/* Called from syscall */
+static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
+{
+ u32 value_size = attr->value_size;
+ struct bpf_stack_map *smap;
+ u64 cost, n_buckets;
+ int err;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return ERR_PTR(-EPERM);
+
+ if (attr->map_flags & ~STACK_CREATE_FLAG_MASK)
+ return ERR_PTR(-EINVAL);
+
+ /* check sanity of attributes */
+ if (attr->max_entries == 0 || attr->key_size != 4 ||
+ value_size < 8 || value_size % 8 ||
+ value_size / 8 > sysctl_perf_event_max_stack)
+ return ERR_PTR(-EINVAL);
+
+ /* hash table size must be power of 2 */
+ n_buckets = roundup_pow_of_two(attr->max_entries);
+ if (!n_buckets)
+ return ERR_PTR(-E2BIG);
+
+ cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap);
+ if (cost >= U32_MAX - PAGE_SIZE)
+ return ERR_PTR(-E2BIG);
+
+ smap = bpf_map_area_alloc(cost);
+ if (!smap)
+ return ERR_PTR(-ENOMEM);
+
+ err = -E2BIG;
+ cost += n_buckets * (value_size + sizeof(struct stack_map_bucket));
+ if (cost >= U32_MAX - PAGE_SIZE)
+ goto free_smap;
+
+ smap->map.map_type = attr->map_type;
+ smap->map.key_size = attr->key_size;
+ smap->map.value_size = value_size;
+ smap->map.max_entries = attr->max_entries;
+ smap->map.map_flags = attr->map_flags;
+ smap->n_buckets = n_buckets;
+ smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+
+ err = bpf_map_precharge_memlock(smap->map.pages);
+ if (err)
+ goto free_smap;
+
+ err = get_callchain_buffers(sysctl_perf_event_max_stack);
+ if (err)
+ goto free_smap;
+
+ err = prealloc_elems_and_freelist(smap);
+ if (err)
+ goto put_buffers;
+
+ return &smap->map;
+
+put_buffers:
+ put_callchain_buffers();
+free_smap:
+ bpf_map_area_free(smap);
+ return ERR_PTR(err);
+}
+
+BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
+ u64, flags)
+{
+ struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
+ struct perf_callchain_entry *trace;
+ struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
+ u32 max_depth = map->value_size / 8;
+ /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
+ u32 init_nr = sysctl_perf_event_max_stack - max_depth;
+ u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
+ u32 hash, id, trace_nr, trace_len;
+ bool user = flags & BPF_F_USER_STACK;
+ bool kernel = !user;
+ u64 *ips;
+
+ if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
+ BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
+ return -EINVAL;
+
+ trace = get_perf_callchain(regs, init_nr, kernel, user,
+ sysctl_perf_event_max_stack, false, false);
+
+ if (unlikely(!trace))
+ /* couldn't fetch the stack trace */
+ return -EFAULT;
+
+ /* get_perf_callchain() guarantees that trace->nr >= init_nr
+ * and trace-nr <= sysctl_perf_event_max_stack, so trace_nr <= max_depth
+ */
+ trace_nr = trace->nr - init_nr;
+
+ if (trace_nr <= skip)
+ /* skipping more than usable stack trace */
+ return -EFAULT;
+
+ trace_nr -= skip;
+ trace_len = trace_nr * sizeof(u64);
+ ips = trace->ip + skip + init_nr;
+ hash = jhash2((u32 *)ips, trace_len / sizeof(u32), 0);
+ id = hash & (smap->n_buckets - 1);
+ bucket = READ_ONCE(smap->buckets[id]);
+
+ if (bucket && bucket->hash == hash) {
+ if (flags & BPF_F_FAST_STACK_CMP)
+ return id;
+ if (bucket->nr == trace_nr &&
+ memcmp(bucket->ip, ips, trace_len) == 0)
+ return id;
+ }
+
+ /* this call stack is not in the map, try to add it */
+ if (bucket && !(flags & BPF_F_REUSE_STACKID))
+ return -EEXIST;
+
+ new_bucket = (struct stack_map_bucket *)
+ pcpu_freelist_pop(&smap->freelist);
+ if (unlikely(!new_bucket))
+ return -ENOMEM;
+
+ memcpy(new_bucket->ip, ips, trace_len);
+ new_bucket->hash = hash;
+ new_bucket->nr = trace_nr;
+
+ old_bucket = xchg(&smap->buckets[id], new_bucket);
+ if (old_bucket)
+ pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
+ return id;
+}
+
+const struct bpf_func_proto bpf_get_stackid_proto = {
+ .func = bpf_get_stackid,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_ANYTHING,
+};
+
+/* Called from eBPF program */
+static void *stack_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ return NULL;
+}
+
+/* Called from syscall */
+int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
+{
+ struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
+ struct stack_map_bucket *bucket, *old_bucket;
+ u32 id = *(u32 *)key, trace_len;
+
+ if (unlikely(id >= smap->n_buckets))
+ return -ENOENT;
+
+ bucket = xchg(&smap->buckets[id], NULL);
+ if (!bucket)
+ return -ENOENT;
+
+ trace_len = bucket->nr * sizeof(u64);
+ memcpy(value, bucket->ip, trace_len);
+ memset(value + trace_len, 0, map->value_size - trace_len);
+
+ old_bucket = xchg(&smap->buckets[id], bucket);
+ if (old_bucket)
+ pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
+ return 0;
+}
+
+static int stack_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+ return -EINVAL;
+}
+
+static int stack_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
+{
+ return -EINVAL;
+}
+
+/* Called from syscall or from eBPF program */
+static int stack_map_delete_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
+ struct stack_map_bucket *old_bucket;
+ u32 id = *(u32 *)key;
+
+ if (unlikely(id >= smap->n_buckets))
+ return -E2BIG;
+
+ old_bucket = xchg(&smap->buckets[id], NULL);
+ if (old_bucket) {
+ pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
+ return 0;
+ } else {
+ return -ENOENT;
+ }
+}
+
+/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
+static void stack_map_free(struct bpf_map *map)
+{
+ struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
+
+ /* wait for bpf programs to complete before freeing stack map */
+ synchronize_rcu();
+
+ bpf_map_area_free(smap->elems);
+ pcpu_freelist_destroy(&smap->freelist);
+ bpf_map_area_free(smap);
+ put_callchain_buffers();
+}
+
+static const struct bpf_map_ops stack_map_ops = {
+ .map_alloc = stack_map_alloc,
+ .map_free = stack_map_free,
+ .map_get_next_key = stack_map_get_next_key,
+ .map_lookup_elem = stack_map_lookup_elem,
+ .map_update_elem = stack_map_update_elem,
+ .map_delete_elem = stack_map_delete_elem,
+};
+
+static struct bpf_map_type_list stack_map_type __read_mostly = {
+ .ops = &stack_map_ops,
+ .type = BPF_MAP_TYPE_STACK_TRACE,
+};
+
+static int __init register_stack_map(void)
+{
+ bpf_register_map_type(&stack_map_type);
+ return 0;
+}
+late_initcall(register_stack_map);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 04fc1022ad9f..f65bd2dc07f8 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -12,13 +12,20 @@
#include <linux/bpf.h>
#include <linux/syscalls.h>
#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/mmzone.h>
#include <linux/anon_inodes.h>
#include <linux/file.h>
#include <linux/license.h>
#include <linux/filter.h>
#include <linux/version.h>
-int sysctl_unprivileged_bpf_disabled __read_mostly;
+#define BPF_OBJ_FLAG_MASK (BPF_F_RDONLY | BPF_F_WRONLY)
+
+DEFINE_PER_CPU(int, bpf_prog_active);
+
+int sysctl_unprivileged_bpf_disabled __read_mostly =
+ IS_BUILTIN(CONFIG_BPF_UNPRIV_DEFAULT_OFF) ? 2 : 0;
static LIST_HEAD(bpf_map_types);
@@ -46,6 +53,43 @@ void bpf_register_map_type(struct bpf_map_type_list *tl)
list_add(&tl->list_node, &bpf_map_types);
}
+void *bpf_map_area_alloc(size_t size)
+{
+ /* We definitely need __GFP_NORETRY, so OOM killer doesn't
+ * trigger under memory pressure as we really just want to
+ * fail instead.
+ */
+ const gfp_t flags = __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO;
+ void *area;
+
+ if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
+ area = kmalloc(size, GFP_USER | flags);
+ if (area != NULL)
+ return area;
+ }
+
+ return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | flags,
+ PAGE_KERNEL);
+}
+
+void bpf_map_area_free(void *area)
+{
+ kvfree(area);
+}
+
+int bpf_map_precharge_memlock(u32 pages)
+{
+ struct user_struct *user = get_current_user();
+ unsigned long memlock_limit, cur;
+
+ memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+ cur = atomic_long_read(&user->locked_vm);
+ free_uid(user);
+ if (cur + pages > memlock_limit)
+ return -EPERM;
+ return 0;
+}
+
static int bpf_map_charge_memlock(struct bpf_map *map)
{
struct user_struct *user = get_current_user();
@@ -78,6 +122,7 @@ static void bpf_map_free_deferred(struct work_struct *work)
struct bpf_map *map = container_of(work, struct bpf_map, work);
bpf_map_uncharge_memlock(map);
+ security_bpf_map_free(map);
/* implementation dependent freeing */
map->ops->map_free(map);
}
@@ -109,18 +154,82 @@ void bpf_map_put_with_uref(struct bpf_map *map)
static int bpf_map_release(struct inode *inode, struct file *filp)
{
- bpf_map_put_with_uref(filp->private_data);
+ struct bpf_map *map = filp->private_data;
+
+ if (map->ops->map_release)
+ map->ops->map_release(map, filp);
+
+ bpf_map_put_with_uref(map);
return 0;
}
-static const struct file_operations bpf_map_fops = {
- .release = bpf_map_release,
+#ifdef CONFIG_PROC_FS
+static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
+{
+ const struct bpf_map *map = filp->private_data;
+
+ seq_printf(m,
+ "map_type:\t%u\n"
+ "key_size:\t%u\n"
+ "value_size:\t%u\n"
+ "max_entries:\t%u\n"
+ "map_flags:\t%#x\n",
+ map->map_type,
+ map->key_size,
+ map->value_size,
+ map->max_entries,
+ map->map_flags);
+}
+#endif
+
+static ssize_t bpf_dummy_read(struct file *filp, char __user *buf, size_t siz,
+ loff_t *ppos)
+{
+ /* We need this handler such that alloc_file() enables
+ * f_mode with FMODE_CAN_READ.
+ */
+ return -EINVAL;
+}
+
+static ssize_t bpf_dummy_write(struct file *filp, const char __user *buf,
+ size_t siz, loff_t *ppos)
+{
+ /* We need this handler such that alloc_file() enables
+ * f_mode with FMODE_CAN_WRITE.
+ */
+ return -EINVAL;
+}
+
+const struct file_operations bpf_map_fops = {
+#ifdef CONFIG_PROC_FS
+ .show_fdinfo = bpf_map_show_fdinfo,
+#endif
+ .release = bpf_map_release,
+ .read = bpf_dummy_read,
+ .write = bpf_dummy_write,
};
-int bpf_map_new_fd(struct bpf_map *map)
+int bpf_map_new_fd(struct bpf_map *map, int flags)
{
+ int ret;
+
+ ret = security_bpf_map(map, OPEN_FMODE(flags));
+ if (ret < 0)
+ return ret;
+
return anon_inode_getfd("bpf-map", &bpf_map_fops, map,
- O_RDWR | O_CLOEXEC);
+ flags | O_CLOEXEC);
+}
+
+int bpf_get_file_flag(int flags)
+{
+ if ((flags & BPF_F_RDONLY) && (flags & BPF_F_WRONLY))
+ return -EINVAL;
+ if (flags & BPF_F_RDONLY)
+ return O_RDONLY;
+ if (flags & BPF_F_WRONLY)
+ return O_WRONLY;
+ return O_RDWR;
}
/* helper macro to check that unused fields 'union bpf_attr' are zero */
@@ -131,17 +240,22 @@ int bpf_map_new_fd(struct bpf_map *map)
offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
sizeof(attr->CMD##_LAST_FIELD)) != NULL
-#define BPF_MAP_CREATE_LAST_FIELD max_entries
+#define BPF_MAP_CREATE_LAST_FIELD map_flags
/* called via syscall */
static int map_create(union bpf_attr *attr)
{
struct bpf_map *map;
+ int f_flags;
int err;
err = CHECK_ATTR(BPF_MAP_CREATE);
if (err)
return -EINVAL;
+ f_flags = bpf_get_file_flag(attr->map_flags);
+ if (f_flags < 0)
+ return f_flags;
+
/* find map type and init map: hashtable vs rbtree vs bloom vs ... */
map = find_and_alloc_map(attr);
if (IS_ERR(map))
@@ -150,11 +264,15 @@ static int map_create(union bpf_attr *attr)
atomic_set(&map->refcnt, 1);
atomic_set(&map->usercnt, 1);
+ err = security_bpf_map_alloc(map);
+ if (err)
+ goto free_map_nouncharge;
+
err = bpf_map_charge_memlock(map);
if (err)
- goto free_map;
+ goto free_map_sec;
- err = bpf_map_new_fd(map);
+ err = bpf_map_new_fd(map, f_flags);
if (err < 0)
/* failed to allocate fd */
goto free_map;
@@ -162,6 +280,10 @@ static int map_create(union bpf_attr *attr)
return err;
free_map:
+ bpf_map_uncharge_memlock(map);
+free_map_sec:
+ security_bpf_map_free(map);
+free_map_nouncharge:
map->ops->map_free(map);
return err;
}
@@ -216,6 +338,11 @@ static void __user *u64_to_ptr(__u64 val)
return (void __user *) (unsigned long) val;
}
+int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
+{
+ return -ENOTSUPP;
+}
+
/* last field in 'union bpf_attr' used by this command */
#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value
@@ -226,6 +353,7 @@ static int map_lookup_elem(union bpf_attr *attr)
int ufd = attr->map_fd;
struct bpf_map *map;
void *key, *value, *ptr;
+ u32 value_size;
struct fd f;
int err;
@@ -237,6 +365,11 @@ static int map_lookup_elem(union bpf_attr *attr)
if (IS_ERR(map))
return PTR_ERR(map);
+ if (!(f.file->f_mode & FMODE_CAN_READ)) {
+ err = -EPERM;
+ goto err_put;
+ }
+
err = -ENOMEM;
key = kmalloc(map->key_size, GFP_USER);
if (!key)
@@ -246,23 +379,37 @@ static int map_lookup_elem(union bpf_attr *attr)
if (copy_from_user(key, ukey, map->key_size) != 0)
goto free_key;
+ if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+ map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
+ value_size = round_up(map->value_size, 8) * num_possible_cpus();
+ else
+ value_size = map->value_size;
+
err = -ENOMEM;
- value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN);
+ value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
if (!value)
goto free_key;
- rcu_read_lock();
- ptr = map->ops->map_lookup_elem(map, key);
- if (ptr)
- memcpy(value, ptr, map->value_size);
- rcu_read_unlock();
+ if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH) {
+ err = bpf_percpu_hash_copy(map, key, value);
+ } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
+ err = bpf_percpu_array_copy(map, key, value);
+ } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
+ err = bpf_stackmap_copy(map, key, value);
+ } else {
+ rcu_read_lock();
+ ptr = map->ops->map_lookup_elem(map, key);
+ if (ptr)
+ memcpy(value, ptr, value_size);
+ rcu_read_unlock();
+ err = ptr ? 0 : -ENOENT;
+ }
- err = -ENOENT;
- if (!ptr)
+ if (err)
goto free_value;
err = -EFAULT;
- if (copy_to_user(uvalue, value, map->value_size) != 0)
+ if (copy_to_user(uvalue, value, value_size) != 0)
goto free_value;
err = 0;
@@ -285,6 +432,7 @@ static int map_update_elem(union bpf_attr *attr)
int ufd = attr->map_fd;
struct bpf_map *map;
void *key, *value;
+ u32 value_size;
struct fd f;
int err;
@@ -296,6 +444,11 @@ static int map_update_elem(union bpf_attr *attr)
if (IS_ERR(map))
return PTR_ERR(map);
+ if (!(f.file->f_mode & FMODE_CAN_WRITE)) {
+ err = -EPERM;
+ goto err_put;
+ }
+
err = -ENOMEM;
key = kmalloc(map->key_size, GFP_USER);
if (!key)
@@ -305,21 +458,44 @@ static int map_update_elem(union bpf_attr *attr)
if (copy_from_user(key, ukey, map->key_size) != 0)
goto free_key;
+ if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+ map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
+ value_size = round_up(map->value_size, 8) * num_possible_cpus();
+ else
+ value_size = map->value_size;
+
err = -ENOMEM;
- value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN);
+ value = kmalloc(value_size, GFP_USER | __GFP_NOWARN);
if (!value)
goto free_key;
err = -EFAULT;
- if (copy_from_user(value, uvalue, map->value_size) != 0)
+ if (copy_from_user(value, uvalue, value_size) != 0)
goto free_value;
- /* eBPF program that use maps are running under rcu_read_lock(),
- * therefore all map accessors rely on this fact, so do the same here
+ /* must increment bpf_prog_active to avoid kprobe+bpf triggering from
+ * inside bpf map update or delete otherwise deadlocks are possible
*/
- rcu_read_lock();
- err = map->ops->map_update_elem(map, key, value, attr->flags);
- rcu_read_unlock();
+ preempt_disable();
+ __this_cpu_inc(bpf_prog_active);
+ if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH) {
+ err = bpf_percpu_hash_update(map, key, value, attr->flags);
+ } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
+ err = bpf_percpu_array_update(map, key, value, attr->flags);
+ } else if (map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY ||
+ map->map_type == BPF_MAP_TYPE_PROG_ARRAY ||
+ map->map_type == BPF_MAP_TYPE_CGROUP_ARRAY) {
+ rcu_read_lock();
+ err = bpf_fd_array_map_update_elem(map, f.file, key, value,
+ attr->flags);
+ rcu_read_unlock();
+ } else {
+ rcu_read_lock();
+ err = map->ops->map_update_elem(map, key, value, attr->flags);
+ rcu_read_unlock();
+ }
+ __this_cpu_dec(bpf_prog_active);
+ preempt_enable();
free_value:
kfree(value);
@@ -349,6 +525,11 @@ static int map_delete_elem(union bpf_attr *attr)
if (IS_ERR(map))
return PTR_ERR(map);
+ if (!(f.file->f_mode & FMODE_CAN_WRITE)) {
+ err = -EPERM;
+ goto err_put;
+ }
+
err = -ENOMEM;
key = kmalloc(map->key_size, GFP_USER);
if (!key)
@@ -358,9 +539,13 @@ static int map_delete_elem(union bpf_attr *attr)
if (copy_from_user(key, ukey, map->key_size) != 0)
goto free_key;
+ preempt_disable();
+ __this_cpu_inc(bpf_prog_active);
rcu_read_lock();
err = map->ops->map_delete_elem(map, key);
rcu_read_unlock();
+ __this_cpu_dec(bpf_prog_active);
+ preempt_enable();
free_key:
kfree(key);
@@ -390,6 +575,11 @@ static int map_get_next_key(union bpf_attr *attr)
if (IS_ERR(map))
return PTR_ERR(map);
+ if (!(f.file->f_mode & FMODE_CAN_READ)) {
+ err = -EPERM;
+ goto err_put;
+ }
+
if (ukey) {
err = -ENOMEM;
key = kmalloc(map->key_size, GFP_USER);
@@ -462,19 +652,39 @@ static void free_used_maps(struct bpf_prog_aux *aux)
kfree(aux->used_maps);
}
+int __bpf_prog_charge(struct user_struct *user, u32 pages)
+{
+ unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+ unsigned long user_bufs;
+
+ if (user) {
+ user_bufs = atomic_long_add_return(pages, &user->locked_vm);
+ if (user_bufs > memlock_limit) {
+ atomic_long_sub(pages, &user->locked_vm);
+ return -EPERM;
+ }
+ }
+
+ return 0;
+}
+
+void __bpf_prog_uncharge(struct user_struct *user, u32 pages)
+{
+ if (user)
+ atomic_long_sub(pages, &user->locked_vm);
+}
+
static int bpf_prog_charge_memlock(struct bpf_prog *prog)
{
struct user_struct *user = get_current_user();
- unsigned long memlock_limit;
-
- memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+ int ret;
- atomic_long_add(prog->pages, &user->locked_vm);
- if (atomic_long_read(&user->locked_vm) > memlock_limit) {
- atomic_long_sub(prog->pages, &user->locked_vm);
+ ret = __bpf_prog_charge(user, prog->pages);
+ if (ret) {
free_uid(user);
- return -EPERM;
+ return ret;
}
+
prog->aux->user = user;
return 0;
}
@@ -483,7 +693,7 @@ static void bpf_prog_uncharge_memlock(struct bpf_prog *prog)
{
struct user_struct *user = prog->aux->user;
- atomic_long_sub(prog->pages, &user->locked_vm);
+ __bpf_prog_uncharge(user, prog->pages);
free_uid(user);
}
@@ -493,6 +703,7 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu)
free_used_maps(aux);
bpf_prog_uncharge_memlock(aux->prog);
+ security_bpf_prog_free(aux);
bpf_prog_free(aux->prog);
}
@@ -511,17 +722,25 @@ static int bpf_prog_release(struct inode *inode, struct file *filp)
return 0;
}
-static const struct file_operations bpf_prog_fops = {
+const struct file_operations bpf_prog_fops = {
.release = bpf_prog_release,
+ .read = bpf_dummy_read,
+ .write = bpf_dummy_write,
};
int bpf_prog_new_fd(struct bpf_prog *prog)
{
+ int ret;
+
+ ret = security_bpf_prog(prog);
+ if (ret < 0)
+ return ret;
+
return anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog,
O_RDWR | O_CLOEXEC);
}
-static struct bpf_prog *__bpf_prog_get(struct fd f)
+static struct bpf_prog *____bpf_prog_get(struct fd f)
{
if (!f.file)
return ERR_PTR(-EBADF);
@@ -533,33 +752,50 @@ static struct bpf_prog *__bpf_prog_get(struct fd f)
return f.file->private_data;
}
-struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog)
+struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i)
{
- if (atomic_inc_return(&prog->aux->refcnt) > BPF_MAX_REFCNT) {
- atomic_dec(&prog->aux->refcnt);
+ if (atomic_add_return(i, &prog->aux->refcnt) > BPF_MAX_REFCNT) {
+ atomic_sub(i, &prog->aux->refcnt);
return ERR_PTR(-EBUSY);
}
return prog;
}
+EXPORT_SYMBOL_GPL(bpf_prog_add);
-/* called by sockets/tracing/seccomp before attaching program to an event
- * pairs with bpf_prog_put()
- */
-struct bpf_prog *bpf_prog_get(u32 ufd)
+struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog)
+{
+ return bpf_prog_add(prog, 1);
+}
+
+static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type)
{
struct fd f = fdget(ufd);
struct bpf_prog *prog;
- prog = __bpf_prog_get(f);
+ prog = ____bpf_prog_get(f);
if (IS_ERR(prog))
return prog;
+ if (type && prog->type != *type) {
+ prog = ERR_PTR(-EINVAL);
+ goto out;
+ }
prog = bpf_prog_inc(prog);
+out:
fdput(f);
-
return prog;
}
-EXPORT_SYMBOL_GPL(bpf_prog_get);
+
+struct bpf_prog *bpf_prog_get(u32 ufd)
+{
+ return __bpf_prog_get(ufd, NULL);
+}
+
+struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type)
+{
+ return __bpf_prog_get(ufd, &type);
+}
+EXPORT_SYMBOL_GPL(bpf_prog_get_type);
/* last field in 'union bpf_attr' used by this command */
#define BPF_PROG_LOAD_LAST_FIELD kern_version
@@ -591,7 +827,9 @@ static int bpf_prog_load(union bpf_attr *attr)
attr->kern_version != LINUX_VERSION_CODE)
return -EINVAL;
- if (type != BPF_PROG_TYPE_SOCKET_FILTER && !capable(CAP_SYS_ADMIN))
+ if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
+ type != BPF_PROG_TYPE_CGROUP_SKB &&
+ !capable(CAP_SYS_ADMIN))
return -EPERM;
/* plain bpf_prog allocation */
@@ -599,10 +837,14 @@ static int bpf_prog_load(union bpf_attr *attr)
if (!prog)
return -ENOMEM;
- err = bpf_prog_charge_memlock(prog);
+ err = security_bpf_prog_alloc(prog->aux);
if (err)
goto free_prog_nouncharge;
+ err = bpf_prog_charge_memlock(prog);
+ if (err)
+ goto free_prog_sec;
+
prog->len = attr->insn_cnt;
err = -EFAULT;
@@ -627,7 +869,7 @@ static int bpf_prog_load(union bpf_attr *attr)
goto free_used_maps;
/* eBPF program is ready to be JITed */
- err = bpf_prog_select_runtime(prog);
+ prog = bpf_prog_select_runtime(prog, &err);
if (err < 0)
goto free_used_maps;
@@ -642,16 +884,18 @@ free_used_maps:
free_used_maps(prog->aux);
free_prog:
bpf_prog_uncharge_memlock(prog);
+free_prog_sec:
+ security_bpf_prog_free(prog->aux);
free_prog_nouncharge:
bpf_prog_free(prog);
return err;
}
-#define BPF_OBJ_LAST_FIELD bpf_fd
+#define BPF_OBJ_LAST_FIELD file_flags
static int bpf_obj_pin(const union bpf_attr *attr)
{
- if (CHECK_ATTR(BPF_OBJ))
+ if (CHECK_ATTR(BPF_OBJ) || attr->file_flags != 0)
return -EINVAL;
return bpf_obj_pin_user(attr->bpf_fd, u64_to_ptr(attr->pathname));
@@ -659,15 +903,127 @@ static int bpf_obj_pin(const union bpf_attr *attr)
static int bpf_obj_get(const union bpf_attr *attr)
{
- if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0)
+ if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0 ||
+ attr->file_flags & ~BPF_OBJ_FLAG_MASK)
+ return -EINVAL;
+
+ return bpf_obj_get_user(u64_to_ptr(attr->pathname),
+ attr->file_flags);
+}
+
+#ifdef CONFIG_CGROUP_BPF
+
+#define BPF_PROG_ATTACH_LAST_FIELD attach_flags
+
+#define BPF_F_ATTACH_MASK \
+ (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI)
+
+static int bpf_prog_attach(const union bpf_attr *attr)
+{
+ struct bpf_prog *prog;
+ struct cgroup *cgrp;
+ int ret;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (CHECK_ATTR(BPF_PROG_ATTACH))
+ return -EINVAL;
+
+ if (attr->attach_flags & ~BPF_F_ATTACH_MASK)
+ return -EINVAL;
+
+ switch (attr->attach_type) {
+ case BPF_CGROUP_INET_INGRESS:
+ case BPF_CGROUP_INET_EGRESS:
+ prog = bpf_prog_get_type(attr->attach_bpf_fd,
+ BPF_PROG_TYPE_CGROUP_SKB);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ cgrp = cgroup_get_from_fd(attr->target_fd);
+ if (IS_ERR(cgrp)) {
+ bpf_prog_put(prog);
+ return PTR_ERR(cgrp);
+ }
+
+ ret = cgroup_bpf_attach(cgrp, prog, attr->attach_type,
+ attr->attach_flags);
+ if (ret)
+ bpf_prog_put(prog);
+ cgroup_put(cgrp);
+ break;
+ case BPF_CGROUP_INET_SOCK_CREATE:
+ prog = bpf_prog_get_type(attr->attach_bpf_fd,
+ BPF_PROG_TYPE_CGROUP_SOCK);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ cgrp = cgroup_get_from_fd(attr->target_fd);
+ if (IS_ERR(cgrp)) {
+ bpf_prog_put(prog);
+ return PTR_ERR(cgrp);
+ }
+
+ ret = cgroup_bpf_attach(cgrp, prog, attr->attach_type,
+ attr->attach_flags);
+ if (ret)
+ bpf_prog_put(prog);
+ cgroup_put(cgrp);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return ret;
+}
+
+#define BPF_PROG_DETACH_LAST_FIELD attach_type
+
+static int bpf_prog_detach(const union bpf_attr *attr)
+{
+ enum bpf_prog_type ptype;
+ struct bpf_prog *prog;
+ struct cgroup *cgrp;
+ int ret;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (CHECK_ATTR(BPF_PROG_DETACH))
+ return -EINVAL;
+
+ switch (attr->attach_type) {
+ case BPF_CGROUP_INET_INGRESS:
+ case BPF_CGROUP_INET_EGRESS:
+ ptype = BPF_PROG_TYPE_CGROUP_SKB;
+ break;
+ case BPF_CGROUP_INET_SOCK_CREATE:
+ ptype = BPF_PROG_TYPE_CGROUP_SOCK;
+ break;
+ default:
return -EINVAL;
+ }
+
+ cgrp = cgroup_get_from_fd(attr->target_fd);
+ if (IS_ERR(cgrp))
+ return PTR_ERR(cgrp);
- return bpf_obj_get_user(u64_to_ptr(attr->pathname));
+ prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
+ if (IS_ERR(prog))
+ prog = NULL;
+
+ ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, 0);
+ if (prog)
+ bpf_prog_put(prog);
+ cgroup_put(cgrp);
+ return ret;
}
+#endif /* CONFIG_CGROUP_BPF */
SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
{
- union bpf_attr attr = {};
+ union bpf_attr attr;
int err;
if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN))
@@ -703,9 +1059,14 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
}
/* copy attributes from user space, may be less than sizeof(bpf_attr) */
+ memset(&attr, 0, sizeof(attr));
if (copy_from_user(&attr, uattr, size) != 0)
return -EFAULT;
+ err = security_bpf(cmd, &attr, size);
+ if (err < 0)
+ return err;
+
switch (cmd) {
case BPF_MAP_CREATE:
err = map_create(&attr);
@@ -731,6 +1092,16 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
case BPF_OBJ_GET:
err = bpf_obj_get(&attr);
break;
+
+#ifdef CONFIG_CGROUP_BPF
+ case BPF_PROG_ATTACH:
+ err = bpf_prog_attach(&attr);
+ break;
+ case BPF_PROG_DETACH:
+ err = bpf_prog_detach(&attr);
+ break;
+#endif
+
default:
err = -EINVAL;
break;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index c43ca9857479..0bdb7c1b558d 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1,4 +1,5 @@
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ * Copyright (c) 2016 Facebook
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
@@ -13,6 +14,7 @@
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/bpf.h>
+#include <linux/bpf_verifier.h>
#include <linux/filter.h>
#include <net/netlink.h>
#include <linux/file.h>
@@ -125,91 +127,27 @@
* are set to NOT_INIT to indicate that they are no longer readable.
*/
-/* types of values stored in eBPF registers */
-enum bpf_reg_type {
- NOT_INIT = 0, /* nothing was written into register */
- UNKNOWN_VALUE, /* reg doesn't contain a valid pointer */
- PTR_TO_CTX, /* reg points to bpf_context */
- CONST_PTR_TO_MAP, /* reg points to struct bpf_map */
- PTR_TO_MAP_VALUE, /* reg points to map element value */
- PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */
- FRAME_PTR, /* reg == frame_pointer */
- PTR_TO_STACK, /* reg == frame_pointer + imm */
- CONST_IMM, /* constant integer value */
-};
-
-struct reg_state {
- enum bpf_reg_type type;
- union {
- /* valid when type == CONST_IMM | PTR_TO_STACK */
- int imm;
-
- /* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE |
- * PTR_TO_MAP_VALUE_OR_NULL
- */
- struct bpf_map *map_ptr;
- };
-};
-
-enum bpf_stack_slot_type {
- STACK_INVALID, /* nothing was stored in this stack slot */
- STACK_SPILL, /* register spilled into stack */
- STACK_MISC /* BPF program wrote some data into this slot */
-};
-
-#define BPF_REG_SIZE 8 /* size of eBPF register in bytes */
-
-/* state of the program:
- * type of all registers and stack info
- */
-struct verifier_state {
- struct reg_state regs[MAX_BPF_REG];
- u8 stack_slot_type[MAX_BPF_STACK];
- struct reg_state spilled_regs[MAX_BPF_STACK / BPF_REG_SIZE];
-};
-
-/* linked list of verifier states used to prune search */
-struct verifier_state_list {
- struct verifier_state state;
- struct verifier_state_list *next;
-};
-
/* verifier_state + insn_idx are pushed to stack when branch is encountered */
-struct verifier_stack_elem {
+struct bpf_verifier_stack_elem {
/* verifer state is 'st'
* before processing instruction 'insn_idx'
* and after processing instruction 'prev_insn_idx'
*/
- struct verifier_state st;
+ struct bpf_verifier_state st;
int insn_idx;
int prev_insn_idx;
- struct verifier_stack_elem *next;
-};
-
-struct bpf_insn_aux_data {
- union {
- enum bpf_reg_type ptr_type; /* pointer type for load/store insns */
- struct bpf_map *map_ptr; /* pointer for call insn into lookup_elem */
- };
- int sanitize_stack_off; /* stack slot to be cleared */
- bool seen; /* this insn was processed by the verifier */
+ struct bpf_verifier_stack_elem *next;
};
-#define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */
+#define BPF_COMPLEXITY_LIMIT_INSNS 98304
+#define BPF_COMPLEXITY_LIMIT_STACK 1024
-/* single container for all structs
- * one verifier_env per bpf_check() call
- */
-struct verifier_env {
- struct bpf_prog *prog; /* eBPF program being verified */
- struct verifier_stack_elem *head; /* stack of verifier states to be processed */
- int stack_size; /* number of states to be processed */
- struct verifier_state cur_state; /* current verifier state */
- struct verifier_state_list **explored_states; /* search pruning optimization */
- struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */
- u32 used_map_cnt; /* number of used maps */
- bool allow_ptr_leaks;
- struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */
+struct bpf_call_arg_meta {
+ struct bpf_map *map_ptr;
+ bool raw_mode;
+ bool pkt_access;
+ int regno;
+ int access_size;
};
/* verbose verifier prints what it's seeing
@@ -244,33 +182,51 @@ static const char * const reg_type_str[] = {
[CONST_PTR_TO_MAP] = "map_ptr",
[PTR_TO_MAP_VALUE] = "map_value",
[PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null",
+ [PTR_TO_MAP_VALUE_ADJ] = "map_value_adj",
[FRAME_PTR] = "fp",
[PTR_TO_STACK] = "fp",
[CONST_IMM] = "imm",
+ [PTR_TO_PACKET] = "pkt",
+ [PTR_TO_PACKET_END] = "pkt_end",
};
-static void print_verifier_state(struct verifier_env *env)
+static void print_verifier_state(struct bpf_verifier_state *state)
{
+ struct bpf_reg_state *reg;
enum bpf_reg_type t;
int i;
for (i = 0; i < MAX_BPF_REG; i++) {
- t = env->cur_state.regs[i].type;
+ reg = &state->regs[i];
+ t = reg->type;
if (t == NOT_INIT)
continue;
verbose(" R%d=%s", i, reg_type_str[t]);
if (t == CONST_IMM || t == PTR_TO_STACK)
- verbose("%d", env->cur_state.regs[i].imm);
+ verbose("%lld", reg->imm);
+ else if (t == PTR_TO_PACKET)
+ verbose("(id=%d,off=%d,r=%d)",
+ reg->id, reg->off, reg->range);
+ else if (t == UNKNOWN_VALUE && reg->imm)
+ verbose("%lld", reg->imm);
else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE ||
- t == PTR_TO_MAP_VALUE_OR_NULL)
- verbose("(ks=%d,vs=%d)",
- env->cur_state.regs[i].map_ptr->key_size,
- env->cur_state.regs[i].map_ptr->value_size);
+ t == PTR_TO_MAP_VALUE_OR_NULL ||
+ t == PTR_TO_MAP_VALUE_ADJ)
+ verbose("(ks=%d,vs=%d,id=%u)",
+ reg->map_ptr->key_size,
+ reg->map_ptr->value_size,
+ reg->id);
+ if (reg->min_value != BPF_REGISTER_MIN_RANGE)
+ verbose(",min_value=%lld",
+ (long long)reg->min_value);
+ if (reg->max_value != BPF_REGISTER_MAX_RANGE)
+ verbose(",max_value=%llu",
+ (unsigned long long)reg->max_value);
}
for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
- if (env->cur_state.stack_slot_type[i] == STACK_SPILL)
+ if (state->stack_slot_type[i] == STACK_SPILL)
verbose(" fp%d=%s", -MAX_BPF_STACK + i,
- reg_type_str[env->cur_state.spilled_regs[i / BPF_REG_SIZE].type]);
+ reg_type_str[state->spilled_regs[i / BPF_REG_SIZE].type]);
}
verbose("\n");
}
@@ -323,7 +279,7 @@ static const char *const bpf_jmp_string[16] = {
[BPF_EXIT >> 4] = "exit",
};
-static void print_bpf_insn(const struct verifier_env *env,
+static void print_bpf_insn(const struct bpf_verifier_env *env,
const struct bpf_insn *insn)
{
u8 class = BPF_CLASS(insn->code);
@@ -431,9 +387,9 @@ static void print_bpf_insn(const struct verifier_env *env,
}
}
-static int pop_stack(struct verifier_env *env, int *prev_insn_idx)
+static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx)
{
- struct verifier_stack_elem *elem;
+ struct bpf_verifier_stack_elem *elem;
int insn_idx;
if (env->head == NULL)
@@ -450,12 +406,12 @@ static int pop_stack(struct verifier_env *env, int *prev_insn_idx)
return insn_idx;
}
-static struct verifier_state *push_stack(struct verifier_env *env, int insn_idx,
- int prev_insn_idx)
+static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
+ int insn_idx, int prev_insn_idx)
{
- struct verifier_stack_elem *elem;
+ struct bpf_verifier_stack_elem *elem;
- elem = kmalloc(sizeof(struct verifier_stack_elem), GFP_KERNEL);
+ elem = kmalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL);
if (!elem)
goto err;
@@ -465,7 +421,7 @@ static struct verifier_state *push_stack(struct verifier_env *env, int insn_idx,
elem->next = env->head;
env->head = elem;
env->stack_size++;
- if (env->stack_size > 1024) {
+ if (env->stack_size > BPF_COMPLEXITY_LIMIT_STACK) {
verbose("BPF program is too complex\n");
goto err;
}
@@ -481,14 +437,15 @@ static const int caller_saved[CALLER_SAVED_REGS] = {
BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5
};
-static void init_reg_state(struct reg_state *regs)
+static void init_reg_state(struct bpf_reg_state *regs)
{
int i;
for (i = 0; i < MAX_BPF_REG; i++) {
regs[i].type = NOT_INIT;
regs[i].imm = 0;
- regs[i].map_ptr = NULL;
+ regs[i].min_value = BPF_REGISTER_MIN_RANGE;
+ regs[i].max_value = BPF_REGISTER_MAX_RANGE;
}
/* frame pointer */
@@ -498,12 +455,23 @@ static void init_reg_state(struct reg_state *regs)
regs[BPF_REG_1].type = PTR_TO_CTX;
}
-static void mark_reg_unknown_value(struct reg_state *regs, u32 regno)
+static void __mark_reg_unknown_value(struct bpf_reg_state *regs, u32 regno)
{
- BUG_ON(regno >= MAX_BPF_REG);
regs[regno].type = UNKNOWN_VALUE;
+ regs[regno].id = 0;
regs[regno].imm = 0;
- regs[regno].map_ptr = NULL;
+}
+
+static void mark_reg_unknown_value(struct bpf_reg_state *regs, u32 regno)
+{
+ BUG_ON(regno >= MAX_BPF_REG);
+ __mark_reg_unknown_value(regs, regno);
+}
+
+static void reset_reg_range_values(struct bpf_reg_state *regs, u32 regno)
+{
+ regs[regno].min_value = BPF_REGISTER_MIN_RANGE;
+ regs[regno].max_value = BPF_REGISTER_MAX_RANGE;
}
enum reg_arg_type {
@@ -512,7 +480,7 @@ enum reg_arg_type {
DST_OP_NO_MARK /* same as above, check only, don't mark */
};
-static int check_reg_arg(struct reg_state *regs, u32 regno,
+static int check_reg_arg(struct bpf_reg_state *regs, u32 regno,
enum reg_arg_type t)
{
if (regno >= MAX_BPF_REG) {
@@ -559,6 +527,8 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
case PTR_TO_MAP_VALUE_OR_NULL:
case PTR_TO_STACK:
case PTR_TO_CTX:
+ case PTR_TO_PACKET:
+ case PTR_TO_PACKET_END:
case FRAME_PTR:
case CONST_PTR_TO_MAP:
return true;
@@ -570,8 +540,8 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
/* check_stack_read/write functions track spill/fill of registers,
* stack boundary and alignment are checked in check_mem_access()
*/
-static int check_stack_write(struct verifier_env *env,
- struct verifier_state *state, int off,
+static int check_stack_write(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *state, int off,
int size, int value_regno, int insn_idx)
{
int i, spi = (MAX_BPF_STACK + off) / BPF_REG_SIZE;
@@ -619,7 +589,7 @@ static int check_stack_write(struct verifier_env *env,
}
} else {
/* regular write of data into stack */
- state->spilled_regs[spi] = (struct reg_state) {};
+ state->spilled_regs[spi] = (struct bpf_reg_state) {};
for (i = 0; i < size; i++)
state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_MISC;
@@ -627,7 +597,7 @@ static int check_stack_write(struct verifier_env *env,
return 0;
}
-static int check_stack_read(struct verifier_state *state, int off, int size,
+static int check_stack_read(struct bpf_verifier_state *state, int off, int size,
int value_regno)
{
u8 *slot_type;
@@ -668,7 +638,7 @@ static int check_stack_read(struct verifier_state *state, int off, int size,
}
/* check read/write into map element returned by bpf_map_lookup_elem() */
-static int check_map_access(struct verifier_env *env, u32 regno, int off,
+static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off,
int size)
{
struct bpf_map *map = env->cur_state.regs[regno].map_ptr;
@@ -681,24 +651,67 @@ static int check_map_access(struct verifier_env *env, u32 regno, int off,
return 0;
}
+#define MAX_PACKET_OFF 0xffff
+
+static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
+ const struct bpf_call_arg_meta *meta)
+{
+ switch (env->prog->type) {
+ case BPF_PROG_TYPE_SCHED_CLS:
+ case BPF_PROG_TYPE_SCHED_ACT:
+ case BPF_PROG_TYPE_XDP:
+ if (meta)
+ return meta->pkt_access;
+
+ env->seen_direct_write = true;
+ return true;
+ default:
+ return false;
+ }
+}
+
+static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
+ int size)
+{
+ struct bpf_reg_state *regs = env->cur_state.regs;
+ struct bpf_reg_state *reg = &regs[regno];
+
+ off += reg->off;
+ if (off < 0 || size <= 0 || off + size > reg->range) {
+ verbose("invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n",
+ off, size, regno, reg->id, reg->off, reg->range);
+ return -EACCES;
+ }
+ return 0;
+}
+
/* check access to 'struct bpf_context' fields */
-static int check_ctx_access(struct verifier_env *env, int off, int size,
- enum bpf_access_type t)
+static int check_ctx_access(struct bpf_verifier_env *env, int off, int size,
+ enum bpf_access_type t, enum bpf_reg_type *reg_type)
{
+ /* for analyzer ctx accesses are already validated and converted */
+ if (env->analyzer_ops)
+ return 0;
+
if (env->prog->aux->ops->is_valid_access &&
- env->prog->aux->ops->is_valid_access(off, size, t))
+ env->prog->aux->ops->is_valid_access(off, size, t, reg_type)) {
+ /* remember the offset of last byte accessed in ctx */
+ if (env->prog->aux->max_ctx_offset < off + size)
+ env->prog->aux->max_ctx_offset = off + size;
return 0;
+ }
verbose("invalid bpf_context access off=%d size=%d\n", off, size);
return -EACCES;
}
-static bool is_pointer_value(struct verifier_env *env, int regno)
+static bool __is_pointer_value(bool allow_ptr_leaks,
+ const struct bpf_reg_state *reg)
{
- if (env->allow_ptr_leaks)
+ if (allow_ptr_leaks)
return false;
- switch (env->cur_state.regs[regno].type) {
+ switch (reg->type) {
case UNKNOWN_VALUE:
case CONST_IMM:
return false;
@@ -707,60 +720,141 @@ static bool is_pointer_value(struct verifier_env *env, int regno)
}
}
-static bool is_ctx_reg(struct verifier_env *env, int regno)
+static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
+{
+ return __is_pointer_value(env->allow_ptr_leaks, &env->cur_state.regs[regno]);
+}
+
+static bool is_ctx_reg(struct bpf_verifier_env *env, int regno)
{
- const struct reg_state *reg = &env->cur_state.regs[regno];
+ const struct bpf_reg_state *reg = &env->cur_state.regs[regno];
return reg->type == PTR_TO_CTX;
}
+static int check_ptr_alignment(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, int off, int size)
+{
+ if (reg->type != PTR_TO_PACKET && reg->type != PTR_TO_MAP_VALUE_ADJ) {
+ if (off % size != 0) {
+ verbose("misaligned access off %d size %d\n",
+ off, size);
+ return -EACCES;
+ } else {
+ return 0;
+ }
+ }
+
+ if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
+ /* misaligned access to packet is ok on x86,arm,arm64 */
+ return 0;
+
+ if (reg->id && size != 1) {
+ verbose("Unknown packet alignment. Only byte-sized access allowed\n");
+ return -EACCES;
+ }
+
+ /* skb->data is NET_IP_ALIGN-ed */
+ if (reg->type == PTR_TO_PACKET &&
+ (NET_IP_ALIGN + reg->off + off) % size != 0) {
+ verbose("misaligned packet access off %d+%d+%d size %d\n",
+ NET_IP_ALIGN, reg->off, off, size);
+ return -EACCES;
+ }
+ return 0;
+}
+
/* check whether memory at (regno + off) is accessible for t = (read | write)
* if t==write, value_regno is a register which value is stored into memory
* if t==read, value_regno is a register which will receive the value from memory
* if t==write && value_regno==-1, some unknown value is stored into memory
* if t==read && value_regno==-1, don't care what we read from memory
*/
-static int check_mem_access(struct verifier_env *env, int insn_idx, u32 regno, int off,
+static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regno, int off,
int bpf_size, enum bpf_access_type t,
int value_regno)
{
- struct verifier_state *state = &env->cur_state;
+ struct bpf_verifier_state *state = &env->cur_state;
+ struct bpf_reg_state *reg = &state->regs[regno];
int size, err = 0;
- if (state->regs[regno].type == PTR_TO_STACK)
- off += state->regs[regno].imm;
+ if (reg->type == PTR_TO_STACK)
+ off += reg->imm;
size = bpf_size_to_bytes(bpf_size);
if (size < 0)
return size;
- if (off % size != 0) {
- verbose("misaligned access off %d size %d\n", off, size);
- return -EACCES;
- }
+ err = check_ptr_alignment(env, reg, off, size);
+ if (err)
+ return err;
- if (state->regs[regno].type == PTR_TO_MAP_VALUE) {
+ if (reg->type == PTR_TO_MAP_VALUE ||
+ reg->type == PTR_TO_MAP_VALUE_ADJ) {
if (t == BPF_WRITE && value_regno >= 0 &&
is_pointer_value(env, value_regno)) {
verbose("R%d leaks addr into map\n", value_regno);
return -EACCES;
}
+
+ /* If we adjusted the register to this map value at all then we
+ * need to change off and size to min_value and max_value
+ * respectively to make sure our theoretical access will be
+ * safe.
+ */
+ if (reg->type == PTR_TO_MAP_VALUE_ADJ) {
+ if (log_level)
+ print_verifier_state(state);
+ env->varlen_map_value_access = true;
+ /* The minimum value is only important with signed
+ * comparisons where we can't assume the floor of a
+ * value is 0. If we are using signed variables for our
+ * index'es we need to make sure that whatever we use
+ * will have a set floor within our range.
+ */
+ if (reg->min_value < 0) {
+ verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
+ regno);
+ return -EACCES;
+ }
+ err = check_map_access(env, regno, reg->min_value + off,
+ size);
+ if (err) {
+ verbose("R%d min value is outside of the array range\n",
+ regno);
+ return err;
+ }
+
+ /* If we haven't set a max value then we need to bail
+ * since we can't be sure we won't do bad things.
+ */
+ if (reg->max_value == BPF_REGISTER_MAX_RANGE) {
+ verbose("R%d unbounded memory access, make sure to bounds check any array access into a map\n",
+ regno);
+ return -EACCES;
+ }
+ off += reg->max_value;
+ }
err = check_map_access(env, regno, off, size);
if (!err && t == BPF_READ && value_regno >= 0)
mark_reg_unknown_value(state->regs, value_regno);
- } else if (state->regs[regno].type == PTR_TO_CTX) {
+ } else if (reg->type == PTR_TO_CTX) {
+ enum bpf_reg_type reg_type = UNKNOWN_VALUE;
+
if (t == BPF_WRITE && value_regno >= 0 &&
is_pointer_value(env, value_regno)) {
verbose("R%d leaks addr into ctx\n", value_regno);
return -EACCES;
}
- err = check_ctx_access(env, off, size, t);
- if (!err && t == BPF_READ && value_regno >= 0)
+ err = check_ctx_access(env, off, size, t, &reg_type);
+ if (!err && t == BPF_READ && value_regno >= 0) {
mark_reg_unknown_value(state->regs, value_regno);
+ /* note that reg.[id|off|range] == 0 */
+ state->regs[value_regno].type = reg_type;
+ }
- } else if (state->regs[regno].type == FRAME_PTR ||
- state->regs[regno].type == PTR_TO_STACK) {
+ } else if (reg->type == FRAME_PTR || reg->type == PTR_TO_STACK) {
if (off >= 0 || off < -MAX_BPF_STACK) {
verbose("invalid stack off=%d size=%d\n", off, size);
return -EACCES;
@@ -777,17 +871,39 @@ static int check_mem_access(struct verifier_env *env, int insn_idx, u32 regno, i
} else {
err = check_stack_read(state, off, size, value_regno);
}
+ } else if (state->regs[regno].type == PTR_TO_PACKET) {
+ if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL)) {
+ verbose("cannot write into packet\n");
+ return -EACCES;
+ }
+ if (t == BPF_WRITE && value_regno >= 0 &&
+ is_pointer_value(env, value_regno)) {
+ verbose("R%d leaks addr into packet\n", value_regno);
+ return -EACCES;
+ }
+ err = check_packet_access(env, regno, off, size);
+ if (!err && t == BPF_READ && value_regno >= 0)
+ mark_reg_unknown_value(state->regs, value_regno);
} else {
verbose("R%d invalid mem access '%s'\n",
- regno, reg_type_str[state->regs[regno].type]);
+ regno, reg_type_str[reg->type]);
return -EACCES;
}
+
+ if (!err && size <= 2 && value_regno >= 0 && env->allow_ptr_leaks &&
+ state->regs[value_regno].type == UNKNOWN_VALUE) {
+ /* 1 or 2 byte load zero-extends, determine the number of
+ * zero upper bits. Not doing it fo 4 byte load, since
+ * such values cannot be added to ptr_to_packet anyway.
+ */
+ state->regs[value_regno].imm = 64 - size * 8;
+ }
return err;
}
-static int check_xadd(struct verifier_env *env, int insn_idx, struct bpf_insn *insn)
+static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn)
{
- struct reg_state *regs = env->cur_state.regs;
+ struct bpf_reg_state *regs = env->cur_state.regs;
int err;
if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) ||
@@ -832,15 +948,25 @@ static int check_xadd(struct verifier_env *env, int insn_idx, struct bpf_insn *i
* bytes from that pointer, make sure that it's within stack boundary
* and all elements of stack are initialized
*/
-static int check_stack_boundary(struct verifier_env *env,
- int regno, int access_size)
+static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
+ int access_size, bool zero_size_allowed,
+ struct bpf_call_arg_meta *meta)
{
- struct verifier_state *state = &env->cur_state;
- struct reg_state *regs = state->regs;
+ struct bpf_verifier_state *state = &env->cur_state;
+ struct bpf_reg_state *regs = state->regs;
int off, i;
- if (regs[regno].type != PTR_TO_STACK)
+ if (regs[regno].type != PTR_TO_STACK) {
+ if (zero_size_allowed && access_size == 0 &&
+ regs[regno].type == CONST_IMM &&
+ regs[regno].imm == 0)
+ return 0;
+
+ verbose("R%d type=%s expected=%s\n", regno,
+ reg_type_str[regs[regno].type],
+ reg_type_str[PTR_TO_STACK]);
return -EACCES;
+ }
off = regs[regno].imm;
if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 ||
@@ -850,6 +976,12 @@ static int check_stack_boundary(struct verifier_env *env,
return -EACCES;
}
+ if (meta && meta->raw_mode) {
+ meta->access_size = access_size;
+ meta->regno = regno;
+ return 0;
+ }
+
for (i = 0; i < access_size; i++) {
if (state->stack_slot_type[MAX_BPF_STACK + off + i] != STACK_MISC) {
verbose("invalid indirect read from stack off %d+%d size %d\n",
@@ -860,17 +992,18 @@ static int check_stack_boundary(struct verifier_env *env,
return 0;
}
-static int check_func_arg(struct verifier_env *env, u32 regno,
- enum bpf_arg_type arg_type, struct bpf_map **mapp)
+static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
+ enum bpf_arg_type arg_type,
+ struct bpf_call_arg_meta *meta)
{
- struct reg_state *reg = env->cur_state.regs + regno;
- enum bpf_reg_type expected_type;
+ struct bpf_reg_state *regs = env->cur_state.regs, *reg = &regs[regno];
+ enum bpf_reg_type expected_type, type = reg->type;
int err = 0;
if (arg_type == ARG_DONTCARE)
return 0;
- if (reg->type == NOT_INIT) {
+ if (type == NOT_INIT) {
verbose("R%d !read_ok\n", regno);
return -EACCES;
}
@@ -883,36 +1016,55 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
return 0;
}
- if (arg_type == ARG_PTR_TO_STACK || arg_type == ARG_PTR_TO_MAP_KEY ||
+ if (type == PTR_TO_PACKET && !may_access_direct_pkt_data(env, meta)) {
+ verbose("helper access to the packet is not allowed\n");
+ return -EACCES;
+ }
+
+ if (arg_type == ARG_PTR_TO_MAP_KEY ||
arg_type == ARG_PTR_TO_MAP_VALUE) {
expected_type = PTR_TO_STACK;
- } else if (arg_type == ARG_CONST_STACK_SIZE) {
+ if (type != PTR_TO_PACKET && type != expected_type)
+ goto err_type;
+ } else if (arg_type == ARG_CONST_STACK_SIZE ||
+ arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) {
expected_type = CONST_IMM;
+ if (type != expected_type)
+ goto err_type;
} else if (arg_type == ARG_CONST_MAP_PTR) {
expected_type = CONST_PTR_TO_MAP;
+ if (type != expected_type)
+ goto err_type;
} else if (arg_type == ARG_PTR_TO_CTX) {
expected_type = PTR_TO_CTX;
+ if (type != expected_type)
+ goto err_type;
+ } else if (arg_type == ARG_PTR_TO_STACK ||
+ arg_type == ARG_PTR_TO_RAW_STACK) {
+ expected_type = PTR_TO_STACK;
+ /* One exception here. In case function allows for NULL to be
+ * passed in as argument, it's a CONST_IMM type. Final test
+ * happens during stack boundary checking.
+ */
+ if (type == CONST_IMM && reg->imm == 0)
+ /* final test in check_stack_boundary() */;
+ else if (type != PTR_TO_PACKET && type != expected_type)
+ goto err_type;
+ meta->raw_mode = arg_type == ARG_PTR_TO_RAW_STACK;
} else {
verbose("unsupported arg_type %d\n", arg_type);
return -EFAULT;
}
- if (reg->type != expected_type) {
- verbose("R%d type=%s expected=%s\n", regno,
- reg_type_str[reg->type], reg_type_str[expected_type]);
- return -EACCES;
- }
-
if (arg_type == ARG_CONST_MAP_PTR) {
/* bpf_map_xxx(map_ptr) call: remember that map_ptr */
- *mapp = reg->map_ptr;
-
+ meta->map_ptr = reg->map_ptr;
} else if (arg_type == ARG_PTR_TO_MAP_KEY) {
/* bpf_map_xxx(..., map_ptr, ..., key) call:
* check that [key, key + map->key_size) are within
* stack limits and initialized
*/
- if (!*mapp) {
+ if (!meta->map_ptr) {
/* in function declaration map_ptr must come before
* map_key, so that it's verified and known before
* we have to check map_key here. Otherwise it means
@@ -921,20 +1073,33 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
verbose("invalid map_ptr to access map->key\n");
return -EACCES;
}
- err = check_stack_boundary(env, regno, (*mapp)->key_size);
-
+ if (type == PTR_TO_PACKET)
+ err = check_packet_access(env, regno, 0,
+ meta->map_ptr->key_size);
+ else
+ err = check_stack_boundary(env, regno,
+ meta->map_ptr->key_size,
+ false, NULL);
} else if (arg_type == ARG_PTR_TO_MAP_VALUE) {
/* bpf_map_xxx(..., map_ptr, ..., value) call:
* check [value, value + map->value_size) validity
*/
- if (!*mapp) {
+ if (!meta->map_ptr) {
/* kernel subsystem misconfigured verifier */
verbose("invalid map_ptr to access map->value\n");
return -EACCES;
}
- err = check_stack_boundary(env, regno, (*mapp)->value_size);
+ if (type == PTR_TO_PACKET)
+ err = check_packet_access(env, regno, 0,
+ meta->map_ptr->value_size);
+ else
+ err = check_stack_boundary(env, regno,
+ meta->map_ptr->value_size,
+ false, NULL);
+ } else if (arg_type == ARG_CONST_STACK_SIZE ||
+ arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) {
+ bool zero_size_allowed = (arg_type == ARG_CONST_STACK_SIZE_OR_ZERO);
- } else if (arg_type == ARG_CONST_STACK_SIZE) {
/* bpf_xxx(..., buf, len) call will access 'len' bytes
* from stack pointer 'buf'. Check it
* note: regno == len, regno - 1 == buf
@@ -944,10 +1109,18 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
verbose("ARG_CONST_STACK_SIZE cannot be first argument\n");
return -EACCES;
}
- err = check_stack_boundary(env, regno - 1, reg->imm);
+ if (regs[regno - 1].type == PTR_TO_PACKET)
+ err = check_packet_access(env, regno - 1, 0, reg->imm);
+ else
+ err = check_stack_boundary(env, regno - 1, reg->imm,
+ zero_size_allowed, meta);
}
return err;
+err_type:
+ verbose("R%d type=%s expected=%s\n", regno,
+ reg_type_str[type], reg_type_str[expected_type]);
+ return -EACCES;
}
static int check_map_func_compatibility(struct bpf_map *map, int func_id)
@@ -966,6 +1139,15 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
func_id != BPF_FUNC_perf_event_output)
goto error;
break;
+ case BPF_MAP_TYPE_STACK_TRACE:
+ if (func_id != BPF_FUNC_get_stackid)
+ goto error;
+ break;
+ case BPF_MAP_TYPE_CGROUP_ARRAY:
+ if (func_id != BPF_FUNC_skb_under_cgroup &&
+ func_id != BPF_FUNC_current_task_under_cgroup)
+ goto error;
+ break;
default:
break;
}
@@ -981,6 +1163,15 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
if (map->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY)
goto error;
break;
+ case BPF_FUNC_get_stackid:
+ if (map->map_type != BPF_MAP_TYPE_STACK_TRACE)
+ goto error;
+ break;
+ case BPF_FUNC_current_task_under_cgroup:
+ case BPF_FUNC_skb_under_cgroup:
+ if (map->map_type != BPF_MAP_TYPE_CGROUP_ARRAY)
+ goto error;
+ break;
default:
break;
}
@@ -992,13 +1183,55 @@ error:
return -EINVAL;
}
-static int check_call(struct verifier_env *env, int func_id, int insn_idx)
+static int check_raw_mode(const struct bpf_func_proto *fn)
+{
+ int count = 0;
+
+ if (fn->arg1_type == ARG_PTR_TO_RAW_STACK)
+ count++;
+ if (fn->arg2_type == ARG_PTR_TO_RAW_STACK)
+ count++;
+ if (fn->arg3_type == ARG_PTR_TO_RAW_STACK)
+ count++;
+ if (fn->arg4_type == ARG_PTR_TO_RAW_STACK)
+ count++;
+ if (fn->arg5_type == ARG_PTR_TO_RAW_STACK)
+ count++;
+
+ return count > 1 ? -EINVAL : 0;
+}
+
+static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
{
- struct verifier_state *state = &env->cur_state;
+ struct bpf_verifier_state *state = &env->cur_state;
+ struct bpf_reg_state *regs = state->regs, *reg;
+ int i;
+
+ for (i = 0; i < MAX_BPF_REG; i++)
+ if (regs[i].type == PTR_TO_PACKET ||
+ regs[i].type == PTR_TO_PACKET_END)
+ mark_reg_unknown_value(regs, i);
+
+ for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
+ if (state->stack_slot_type[i] != STACK_SPILL)
+ continue;
+ reg = &state->spilled_regs[i / BPF_REG_SIZE];
+ if (reg->type != PTR_TO_PACKET &&
+ reg->type != PTR_TO_PACKET_END)
+ continue;
+ reg->type = UNKNOWN_VALUE;
+ reg->imm = 0;
+ }
+}
+
+static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
+{
+ struct bpf_verifier_state *state = &env->cur_state;
const struct bpf_func_proto *fn = NULL;
- struct reg_state *regs = state->regs;
- struct bpf_map *map = NULL;
- struct reg_state *reg;
+ struct bpf_reg_state *regs = state->regs;
+ struct bpf_reg_state *reg;
+ struct bpf_call_arg_meta meta;
+ bool changes_data;
int i, err;
/* find function prototype */
@@ -1021,30 +1254,53 @@ static int check_call(struct verifier_env *env, int func_id, int insn_idx)
return -EINVAL;
}
+ changes_data = bpf_helper_changes_skb_data(fn->func);
+
+ memset(&meta, 0, sizeof(meta));
+ meta.pkt_access = fn->pkt_access;
+
+ /* We only support one arg being in raw mode at the moment, which
+ * is sufficient for the helper functions we have right now.
+ */
+ err = check_raw_mode(fn);
+ if (err) {
+ verbose("kernel subsystem misconfigured func %d\n", func_id);
+ return err;
+ }
+
/* check args */
- err = check_func_arg(env, BPF_REG_1, fn->arg1_type, &map);
+ err = check_func_arg(env, BPF_REG_1, fn->arg1_type, &meta);
if (err)
return err;
- err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &map);
+ err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &meta);
if (err)
return err;
if (func_id == BPF_FUNC_tail_call) {
- if (map == NULL) {
+ if (meta.map_ptr == NULL) {
verbose("verifier bug\n");
return -EINVAL;
}
- env->insn_aux_data[insn_idx].map_ptr = map;
+ env->insn_aux_data[insn_idx].map_ptr = meta.map_ptr;
}
- err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &map);
+ err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &meta);
if (err)
return err;
- err = check_func_arg(env, BPF_REG_4, fn->arg4_type, &map);
+ err = check_func_arg(env, BPF_REG_4, fn->arg4_type, &meta);
if (err)
return err;
- err = check_func_arg(env, BPF_REG_5, fn->arg5_type, &map);
+ err = check_func_arg(env, BPF_REG_5, fn->arg5_type, &meta);
if (err)
return err;
+ /* Mark slots with STACK_MISC in case of raw mode, stack offset
+ * is inferred from register state.
+ */
+ for (i = 0; i < meta.access_size; i++) {
+ err = check_mem_access(env, insn_idx, meta.regno, i, BPF_B, BPF_WRITE, -1);
+ if (err)
+ return err;
+ }
+
/* reset caller saved regs */
for (i = 0; i < CALLER_SAVED_REGS; i++) {
reg = regs + caller_saved[i];
@@ -1059,32 +1315,441 @@ static int check_call(struct verifier_env *env, int func_id, int insn_idx)
regs[BPF_REG_0].type = NOT_INIT;
} else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) {
regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;
+ regs[BPF_REG_0].max_value = regs[BPF_REG_0].min_value = 0;
/* remember map_ptr, so that check_map_access()
* can check 'value_size' boundary of memory access
* to map element returned from bpf_map_lookup_elem()
*/
- if (map == NULL) {
+ if (meta.map_ptr == NULL) {
verbose("kernel subsystem misconfigured verifier\n");
return -EINVAL;
}
- regs[BPF_REG_0].map_ptr = map;
+ regs[BPF_REG_0].map_ptr = meta.map_ptr;
+ regs[BPF_REG_0].id = ++env->id_gen;
} else {
verbose("unknown return type %d of func %d\n",
fn->ret_type, func_id);
return -EINVAL;
}
- err = check_map_func_compatibility(map, func_id);
+ err = check_map_func_compatibility(meta.map_ptr, func_id);
if (err)
return err;
+ if (changes_data)
+ clear_all_pkt_pointers(env);
return 0;
}
+static int check_packet_ptr_add(struct bpf_verifier_env *env,
+ struct bpf_insn *insn)
+{
+ struct bpf_reg_state *regs = env->cur_state.regs;
+ struct bpf_reg_state *dst_reg = &regs[insn->dst_reg];
+ struct bpf_reg_state *src_reg = &regs[insn->src_reg];
+ struct bpf_reg_state tmp_reg;
+ s32 imm;
+
+ if (BPF_SRC(insn->code) == BPF_K) {
+ /* pkt_ptr += imm */
+ imm = insn->imm;
+
+add_imm:
+ if (imm <= 0) {
+ verbose("addition of negative constant to packet pointer is not allowed\n");
+ return -EACCES;
+ }
+ if (imm >= MAX_PACKET_OFF ||
+ imm + dst_reg->off >= MAX_PACKET_OFF) {
+ verbose("constant %d is too large to add to packet pointer\n",
+ imm);
+ return -EACCES;
+ }
+ /* a constant was added to pkt_ptr.
+ * Remember it while keeping the same 'id'
+ */
+ dst_reg->off += imm;
+ } else {
+ if (src_reg->type == PTR_TO_PACKET) {
+ /* R6=pkt(id=0,off=0,r=62) R7=imm22; r7 += r6 */
+ tmp_reg = *dst_reg; /* save r7 state */
+ *dst_reg = *src_reg; /* copy pkt_ptr state r6 into r7 */
+ src_reg = &tmp_reg; /* pretend it's src_reg state */
+ /* if the checks below reject it, the copy won't matter,
+ * since we're rejecting the whole program. If all ok,
+ * then imm22 state will be added to r7
+ * and r7 will be pkt(id=0,off=22,r=62) while
+ * r6 will stay as pkt(id=0,off=0,r=62)
+ */
+ }
+
+ if (src_reg->type == CONST_IMM) {
+ /* pkt_ptr += reg where reg is known constant */
+ imm = src_reg->imm;
+ goto add_imm;
+ }
+ /* disallow pkt_ptr += reg
+ * if reg is not uknown_value with guaranteed zero upper bits
+ * otherwise pkt_ptr may overflow and addition will become
+ * subtraction which is not allowed
+ */
+ if (src_reg->type != UNKNOWN_VALUE) {
+ verbose("cannot add '%s' to ptr_to_packet\n",
+ reg_type_str[src_reg->type]);
+ return -EACCES;
+ }
+ if (src_reg->imm < 48) {
+ verbose("cannot add integer value with %lld upper zero bits to ptr_to_packet\n",
+ src_reg->imm);
+ return -EACCES;
+ }
+ /* dst_reg stays as pkt_ptr type and since some positive
+ * integer value was added to the pointer, increment its 'id'
+ */
+ dst_reg->id = ++env->id_gen;
+
+ /* something was added to pkt_ptr, set range and off to zero */
+ dst_reg->off = 0;
+ dst_reg->range = 0;
+ }
+ return 0;
+}
+
+static int evaluate_reg_alu(struct bpf_verifier_env *env, struct bpf_insn *insn)
+{
+ struct bpf_reg_state *regs = env->cur_state.regs;
+ struct bpf_reg_state *dst_reg = &regs[insn->dst_reg];
+ u8 opcode = BPF_OP(insn->code);
+ s64 imm_log2;
+
+ /* for type == UNKNOWN_VALUE:
+ * imm > 0 -> number of zero upper bits
+ * imm == 0 -> don't track which is the same as all bits can be non-zero
+ */
+
+ if (BPF_SRC(insn->code) == BPF_X) {
+ struct bpf_reg_state *src_reg = &regs[insn->src_reg];
+
+ if (src_reg->type == UNKNOWN_VALUE && src_reg->imm > 0 &&
+ dst_reg->imm && opcode == BPF_ADD) {
+ /* dreg += sreg
+ * where both have zero upper bits. Adding them
+ * can only result making one more bit non-zero
+ * in the larger value.
+ * Ex. 0xffff (imm=48) + 1 (imm=63) = 0x10000 (imm=47)
+ * 0xffff (imm=48) + 0xffff = 0x1fffe (imm=47)
+ */
+ dst_reg->imm = min(dst_reg->imm, src_reg->imm);
+ dst_reg->imm--;
+ return 0;
+ }
+ if (src_reg->type == CONST_IMM && src_reg->imm > 0 &&
+ dst_reg->imm && opcode == BPF_ADD) {
+ /* dreg += sreg
+ * where dreg has zero upper bits and sreg is const.
+ * Adding them can only result making one more bit
+ * non-zero in the larger value.
+ */
+ imm_log2 = __ilog2_u64((long long)src_reg->imm);
+ dst_reg->imm = min(dst_reg->imm, 63 - imm_log2);
+ dst_reg->imm--;
+ return 0;
+ }
+ /* all other cases non supported yet, just mark dst_reg */
+ dst_reg->imm = 0;
+ return 0;
+ }
+
+ /* sign extend 32-bit imm into 64-bit to make sure that
+ * negative values occupy bit 63. Note ilog2() would have
+ * been incorrect, since sizeof(insn->imm) == 4
+ */
+ imm_log2 = __ilog2_u64((long long)insn->imm);
+
+ if (dst_reg->imm && opcode == BPF_LSH) {
+ /* reg <<= imm
+ * if reg was a result of 2 byte load, then its imm == 48
+ * which means that upper 48 bits are zero and shifting this reg
+ * left by 4 would mean that upper 44 bits are still zero
+ */
+ dst_reg->imm -= insn->imm;
+ } else if (dst_reg->imm && opcode == BPF_MUL) {
+ /* reg *= imm
+ * if multiplying by 14 subtract 4
+ * This is conservative calculation of upper zero bits.
+ * It's not trying to special case insn->imm == 1 or 0 cases
+ */
+ dst_reg->imm -= imm_log2 + 1;
+ } else if (opcode == BPF_AND) {
+ /* reg &= imm */
+ dst_reg->imm = 63 - imm_log2;
+ } else if (dst_reg->imm && opcode == BPF_ADD) {
+ /* reg += imm */
+ dst_reg->imm = min(dst_reg->imm, 63 - imm_log2);
+ dst_reg->imm--;
+ } else if (opcode == BPF_RSH) {
+ /* reg >>= imm
+ * which means that after right shift, upper bits will be zero
+ * note that verifier already checked that
+ * 0 <= imm < 64 for shift insn
+ */
+ dst_reg->imm += insn->imm;
+ if (unlikely(dst_reg->imm > 64))
+ /* some dumb code did:
+ * r2 = *(u32 *)mem;
+ * r2 >>= 32;
+ * and all bits are zero now */
+ dst_reg->imm = 64;
+ } else {
+ /* all other alu ops, means that we don't know what will
+ * happen to the value, mark it with unknown number of zero bits
+ */
+ dst_reg->imm = 0;
+ }
+
+ if (dst_reg->imm < 0) {
+ /* all 64 bits of the register can contain non-zero bits
+ * and such value cannot be added to ptr_to_packet, since it
+ * may overflow, mark it as unknown to avoid further eval
+ */
+ dst_reg->imm = 0;
+ }
+ return 0;
+}
+
+static int evaluate_reg_imm_alu_unknown(struct bpf_verifier_env *env,
+ struct bpf_insn *insn)
+{
+ struct bpf_reg_state *regs = env->cur_state.regs;
+ struct bpf_reg_state *dst_reg = &regs[insn->dst_reg];
+ struct bpf_reg_state *src_reg = &regs[insn->src_reg];
+ u8 opcode = BPF_OP(insn->code);
+ s64 imm_log2 = __ilog2_u64((long long)dst_reg->imm);
+
+ /* BPF_X code with src_reg->type UNKNOWN_VALUE here. */
+ if (src_reg->imm > 0 && dst_reg->imm) {
+ switch (opcode) {
+ case BPF_ADD:
+ /* dreg += sreg
+ * where both have zero upper bits. Adding them
+ * can only result making one more bit non-zero
+ * in the larger value.
+ * Ex. 0xffff (imm=48) + 1 (imm=63) = 0x10000 (imm=47)
+ * 0xffff (imm=48) + 0xffff = 0x1fffe (imm=47)
+ */
+ dst_reg->imm = min(src_reg->imm, 63 - imm_log2);
+ dst_reg->imm--;
+ break;
+ case BPF_AND:
+ /* dreg &= sreg
+ * AND can not extend zero bits only shrink
+ * Ex. 0x00..00ffffff
+ * & 0x0f..ffffffff
+ * ----------------
+ * 0x00..00ffffff
+ */
+ dst_reg->imm = max(src_reg->imm, 63 - imm_log2);
+ break;
+ case BPF_OR:
+ /* dreg |= sreg
+ * OR can only extend zero bits
+ * Ex. 0x00..00ffffff
+ * | 0x0f..ffffffff
+ * ----------------
+ * 0x0f..00ffffff
+ */
+ dst_reg->imm = min(src_reg->imm, 63 - imm_log2);
+ break;
+ case BPF_SUB:
+ case BPF_MUL:
+ case BPF_RSH:
+ case BPF_LSH:
+ /* These may be flushed out later */
+ default:
+ mark_reg_unknown_value(regs, insn->dst_reg);
+ }
+ } else {
+ mark_reg_unknown_value(regs, insn->dst_reg);
+ }
+
+ dst_reg->type = UNKNOWN_VALUE;
+ return 0;
+}
+
+static int evaluate_reg_imm_alu(struct bpf_verifier_env *env,
+ struct bpf_insn *insn)
+{
+ struct bpf_reg_state *regs = env->cur_state.regs;
+ struct bpf_reg_state *dst_reg = &regs[insn->dst_reg];
+ struct bpf_reg_state *src_reg = &regs[insn->src_reg];
+ u8 opcode = BPF_OP(insn->code);
+
+ if (BPF_SRC(insn->code) == BPF_X && src_reg->type == UNKNOWN_VALUE)
+ return evaluate_reg_imm_alu_unknown(env, insn);
+
+ /* dst_reg->type == CONST_IMM here, simulate execution of 'add' insn.
+ * Don't care about overflow or negative values, just add them
+ */
+ if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_K)
+ dst_reg->imm += insn->imm;
+ else if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_X &&
+ src_reg->type == CONST_IMM)
+ dst_reg->imm += src_reg->imm;
+ else
+ mark_reg_unknown_value(regs, insn->dst_reg);
+ return 0;
+}
+
+static void check_reg_overflow(struct bpf_reg_state *reg)
+{
+ if (reg->max_value > BPF_REGISTER_MAX_RANGE)
+ reg->max_value = BPF_REGISTER_MAX_RANGE;
+ if (reg->min_value < BPF_REGISTER_MIN_RANGE ||
+ reg->min_value > BPF_REGISTER_MAX_RANGE)
+ reg->min_value = BPF_REGISTER_MIN_RANGE;
+}
+
+static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,
+ struct bpf_insn *insn)
+{
+ struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg;
+ s64 min_val = BPF_REGISTER_MIN_RANGE;
+ u64 max_val = BPF_REGISTER_MAX_RANGE;
+ bool min_set = false, max_set = false;
+ u8 opcode = BPF_OP(insn->code);
+
+ dst_reg = &regs[insn->dst_reg];
+ if (BPF_SRC(insn->code) == BPF_X) {
+ check_reg_overflow(&regs[insn->src_reg]);
+ min_val = regs[insn->src_reg].min_value;
+ max_val = regs[insn->src_reg].max_value;
+
+ /* If the source register is a random pointer then the
+ * min_value/max_value values represent the range of the known
+ * accesses into that value, not the actual min/max value of the
+ * register itself. In this case we have to reset the reg range
+ * values so we know it is not safe to look at.
+ */
+ if (regs[insn->src_reg].type != CONST_IMM &&
+ regs[insn->src_reg].type != UNKNOWN_VALUE) {
+ min_val = BPF_REGISTER_MIN_RANGE;
+ max_val = BPF_REGISTER_MAX_RANGE;
+ }
+ } else if (insn->imm < BPF_REGISTER_MAX_RANGE &&
+ (s64)insn->imm > BPF_REGISTER_MIN_RANGE) {
+ min_val = max_val = insn->imm;
+ min_set = max_set = true;
+ }
+
+ /* We don't know anything about what was done to this register, mark it
+ * as unknown. Also, if both derived bounds came from signed/unsigned
+ * mixed compares and one side is unbounded, we cannot really do anything
+ * with them as boundaries cannot be trusted. Thus, arithmetic of two
+ * regs of such kind will get invalidated bounds on the dst side.
+ */
+ if ((min_val == BPF_REGISTER_MIN_RANGE &&
+ max_val == BPF_REGISTER_MAX_RANGE) ||
+ (BPF_SRC(insn->code) == BPF_X &&
+ ((min_val != BPF_REGISTER_MIN_RANGE &&
+ max_val == BPF_REGISTER_MAX_RANGE) ||
+ (min_val == BPF_REGISTER_MIN_RANGE &&
+ max_val != BPF_REGISTER_MAX_RANGE) ||
+ (dst_reg->min_value != BPF_REGISTER_MIN_RANGE &&
+ dst_reg->max_value == BPF_REGISTER_MAX_RANGE) ||
+ (dst_reg->min_value == BPF_REGISTER_MIN_RANGE &&
+ dst_reg->max_value != BPF_REGISTER_MAX_RANGE)) &&
+ regs[insn->dst_reg].value_from_signed !=
+ regs[insn->src_reg].value_from_signed)) {
+ reset_reg_range_values(regs, insn->dst_reg);
+ return;
+ }
+
+ /* If one of our values was at the end of our ranges then we can't just
+ * do our normal operations to the register, we need to set the values
+ * to the min/max since they are undefined.
+ */
+ if (opcode != BPF_SUB) {
+ if (min_val == BPF_REGISTER_MIN_RANGE)
+ dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
+ if (max_val == BPF_REGISTER_MAX_RANGE)
+ dst_reg->max_value = BPF_REGISTER_MAX_RANGE;
+ }
+
+ switch (opcode) {
+ case BPF_ADD:
+ if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE)
+ dst_reg->min_value += min_val;
+ if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
+ dst_reg->max_value += max_val;
+ break;
+ case BPF_SUB:
+ /* If one of our values was at the end of our ranges, then the
+ * _opposite_ value in the dst_reg goes to the end of our range.
+ */
+ if (min_val == BPF_REGISTER_MIN_RANGE)
+ dst_reg->max_value = BPF_REGISTER_MAX_RANGE;
+ if (max_val == BPF_REGISTER_MAX_RANGE)
+ dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
+ if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE)
+ dst_reg->min_value -= max_val;
+ if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
+ dst_reg->max_value -= min_val;
+ break;
+ case BPF_MUL:
+ if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE)
+ dst_reg->min_value *= min_val;
+ if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
+ dst_reg->max_value *= max_val;
+ break;
+ case BPF_AND:
+ /* Disallow AND'ing of negative numbers, ain't nobody got time
+ * for that. Otherwise the minimum is 0 and the max is the max
+ * value we could AND against.
+ */
+ if (min_val < 0)
+ dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
+ else
+ dst_reg->min_value = 0;
+ dst_reg->max_value = max_val;
+ break;
+ case BPF_LSH:
+ /* Gotta have special overflow logic here, if we're shifting
+ * more than MAX_RANGE then just assume we have an invalid
+ * range.
+ */
+ if (min_val > ilog2(BPF_REGISTER_MAX_RANGE))
+ dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
+ else if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE)
+ dst_reg->min_value <<= min_val;
+
+ if (max_val > ilog2(BPF_REGISTER_MAX_RANGE))
+ dst_reg->max_value = BPF_REGISTER_MAX_RANGE;
+ else if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
+ dst_reg->max_value <<= max_val;
+ break;
+ case BPF_RSH:
+ /* RSH by a negative number is undefined, and the BPF_RSH is an
+ * unsigned shift, so make the appropriate casts.
+ */
+ if (min_val < 0 || dst_reg->min_value < 0)
+ reset_reg_range_values(regs, insn->dst_reg);
+ else
+ dst_reg->min_value = (u64)(dst_reg->min_value) >> max_val;
+ if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
+ dst_reg->max_value >>= min_val;
+ break;
+ default:
+ reset_reg_range_values(regs, insn->dst_reg);
+ break;
+ }
+
+ check_reg_overflow(dst_reg);
+}
+
/* check validity of 32-bit and 64-bit arithmetic operations */
-static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn)
+static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
{
- struct reg_state *regs = env->cur_state.regs;
+ struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg;
u8 opcode = BPF_OP(insn->code);
int err;
@@ -1145,6 +1810,11 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn)
if (err)
return err;
+ /* we are setting our register to something new, we need to
+ * reset its range values.
+ */
+ reset_reg_range_values(regs, insn->dst_reg);
+
if (BPF_SRC(insn->code) == BPF_X) {
if (BPF_CLASS(insn->code) == BPF_ALU64) {
/* case: R1 = R2
@@ -1157,16 +1827,23 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn)
insn->src_reg);
return -EACCES;
}
- regs[insn->dst_reg].type = UNKNOWN_VALUE;
- regs[insn->dst_reg].map_ptr = NULL;
+ mark_reg_unknown_value(regs, insn->dst_reg);
}
- } else if (BPF_CLASS(insn->code) == BPF_ALU64 ||
- insn->imm >= 0) {
+ } else {
/* case: R = imm
* remember the value we stored into this reg
*/
+ u64 imm;
+
+ if (BPF_CLASS(insn->code) == BPF_ALU64)
+ imm = insn->imm;
+ else
+ imm = (u32)insn->imm;
+
regs[insn->dst_reg].type = CONST_IMM;
- regs[insn->dst_reg].imm = insn->imm;
+ regs[insn->dst_reg].imm = imm;
+ regs[insn->dst_reg].max_value = imm;
+ regs[insn->dst_reg].min_value = imm;
}
} else if (opcode > BPF_END) {
@@ -1175,8 +1852,6 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn)
} else { /* all other ALU ops: and, sub, xor, add, ... */
- bool stack_relative = false;
-
if (BPF_SRC(insn->code) == BPF_X) {
if (insn->imm != 0 || insn->off != 0) {
verbose("BPF_ALU uses reserved fields\n");
@@ -1219,11 +1894,68 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn)
}
}
+ /* check dest operand */
+ err = check_reg_arg(regs, insn->dst_reg, DST_OP_NO_MARK);
+ if (err)
+ return err;
+
+ dst_reg = &regs[insn->dst_reg];
+
+ /* first we want to adjust our ranges. */
+ adjust_reg_min_max_vals(env, insn);
+
/* pattern match 'bpf_add Rx, imm' instruction */
if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 &&
- regs[insn->dst_reg].type == FRAME_PTR &&
- BPF_SRC(insn->code) == BPF_K) {
- stack_relative = true;
+ dst_reg->type == FRAME_PTR && BPF_SRC(insn->code) == BPF_K) {
+ dst_reg->type = PTR_TO_STACK;
+ dst_reg->imm = insn->imm;
+ return 0;
+ } else if (opcode == BPF_ADD &&
+ BPF_CLASS(insn->code) == BPF_ALU64 &&
+ dst_reg->type == PTR_TO_STACK &&
+ ((BPF_SRC(insn->code) == BPF_X &&
+ regs[insn->src_reg].type == CONST_IMM) ||
+ BPF_SRC(insn->code) == BPF_K)) {
+ if (BPF_SRC(insn->code) == BPF_X) {
+ /* check in case the register contains a big
+ * 64-bit value
+ */
+ if (regs[insn->src_reg].imm < -MAX_BPF_STACK ||
+ regs[insn->src_reg].imm > MAX_BPF_STACK) {
+ verbose("R%d value too big in R%d pointer arithmetic\n",
+ insn->src_reg, insn->dst_reg);
+ return -EACCES;
+ }
+ dst_reg->imm += regs[insn->src_reg].imm;
+ } else {
+ /* safe against overflow: addition of 32-bit
+ * numbers in 64-bit representation
+ */
+ dst_reg->imm += insn->imm;
+ }
+ if (dst_reg->imm > 0 || dst_reg->imm < -MAX_BPF_STACK) {
+ verbose("R%d out-of-bounds pointer arithmetic\n",
+ insn->dst_reg);
+ return -EACCES;
+ }
+ return 0;
+ } else if (opcode == BPF_ADD &&
+ BPF_CLASS(insn->code) == BPF_ALU64 &&
+ (dst_reg->type == PTR_TO_PACKET ||
+ (BPF_SRC(insn->code) == BPF_X &&
+ regs[insn->src_reg].type == PTR_TO_PACKET))) {
+ /* ptr_to_packet += K|X */
+ return check_packet_ptr_add(env, insn);
+ } else if (BPF_CLASS(insn->code) == BPF_ALU64 &&
+ dst_reg->type == UNKNOWN_VALUE &&
+ env->allow_ptr_leaks) {
+ /* unknown += K|X */
+ return evaluate_reg_alu(env, insn);
+ } else if (BPF_CLASS(insn->code) == BPF_ALU64 &&
+ dst_reg->type == CONST_IMM &&
+ env->allow_ptr_leaks) {
+ /* reg_imm += K|X */
+ return evaluate_reg_imm_alu(env, insn);
} else if (is_pointer_value(env, insn->dst_reg)) {
verbose("R%d pointer arithmetic prohibited\n",
insn->dst_reg);
@@ -1235,25 +1967,275 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn)
return -EACCES;
}
- /* check dest operand */
- err = check_reg_arg(regs, insn->dst_reg, DST_OP);
- if (err)
- return err;
+ /* If we did pointer math on a map value then just set it to our
+ * PTR_TO_MAP_VALUE_ADJ type so we can deal with any stores or
+ * loads to this register appropriately, otherwise just mark the
+ * register as unknown.
+ */
+ if (env->allow_ptr_leaks &&
+ BPF_CLASS(insn->code) == BPF_ALU64 && opcode == BPF_ADD &&
+ (dst_reg->type == PTR_TO_MAP_VALUE ||
+ dst_reg->type == PTR_TO_MAP_VALUE_ADJ))
+ dst_reg->type = PTR_TO_MAP_VALUE_ADJ;
+ else
+ mark_reg_unknown_value(regs, insn->dst_reg);
+ }
- if (stack_relative) {
- regs[insn->dst_reg].type = PTR_TO_STACK;
- regs[insn->dst_reg].imm = insn->imm;
+ return 0;
+}
+
+static void find_good_pkt_pointers(struct bpf_verifier_state *state,
+ struct bpf_reg_state *dst_reg)
+{
+ struct bpf_reg_state *regs = state->regs, *reg;
+ int i;
+
+ /* LLVM can generate two kind of checks:
+ *
+ * Type 1:
+ *
+ * r2 = r3;
+ * r2 += 8;
+ * if (r2 > pkt_end) goto <handle exception>
+ * <access okay>
+ *
+ * Where:
+ * r2 == dst_reg, pkt_end == src_reg
+ * r2=pkt(id=n,off=8,r=0)
+ * r3=pkt(id=n,off=0,r=0)
+ *
+ * Type 2:
+ *
+ * r2 = r3;
+ * r2 += 8;
+ * if (pkt_end >= r2) goto <access okay>
+ * <handle exception>
+ *
+ * Where:
+ * pkt_end == dst_reg, r2 == src_reg
+ * r2=pkt(id=n,off=8,r=0)
+ * r3=pkt(id=n,off=0,r=0)
+ *
+ * Find register r3 and mark its range as r3=pkt(id=n,off=0,r=8)
+ * so that range of bytes [r3, r3 + 8) is safe to access.
+ */
+
+ for (i = 0; i < MAX_BPF_REG; i++)
+ if (regs[i].type == PTR_TO_PACKET && regs[i].id == dst_reg->id)
+ /* keep the maximum range already checked */
+ regs[i].range = max(regs[i].range, dst_reg->off);
+
+ for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
+ if (state->stack_slot_type[i] != STACK_SPILL)
+ continue;
+ reg = &state->spilled_regs[i / BPF_REG_SIZE];
+ if (reg->type == PTR_TO_PACKET && reg->id == dst_reg->id)
+ reg->range = max(reg->range, dst_reg->off);
+ }
+}
+
+/* Adjusts the register min/max values in the case that the dst_reg is the
+ * variable register that we are working on, and src_reg is a constant or we're
+ * simply doing a BPF_K check.
+ */
+static void reg_set_min_max(struct bpf_reg_state *true_reg,
+ struct bpf_reg_state *false_reg, u64 val,
+ u8 opcode)
+{
+ bool value_from_signed = true;
+ bool is_range = true;
+
+ switch (opcode) {
+ case BPF_JEQ:
+ /* If this is false then we know nothing Jon Snow, but if it is
+ * true then we know for sure.
+ */
+ true_reg->max_value = true_reg->min_value = val;
+ is_range = false;
+ break;
+ case BPF_JNE:
+ /* If this is true we know nothing Jon Snow, but if it is false
+ * we know the value for sure;
+ */
+ false_reg->max_value = false_reg->min_value = val;
+ is_range = false;
+ break;
+ case BPF_JGT:
+ value_from_signed = false;
+ /* fallthrough */
+ case BPF_JSGT:
+ if (true_reg->value_from_signed != value_from_signed)
+ reset_reg_range_values(true_reg, 0);
+ if (false_reg->value_from_signed != value_from_signed)
+ reset_reg_range_values(false_reg, 0);
+ if (opcode == BPF_JGT) {
+ /* Unsigned comparison, the minimum value is 0. */
+ false_reg->min_value = 0;
}
+ /* If this is false then we know the maximum val is val,
+ * otherwise we know the min val is val+1.
+ */
+ false_reg->max_value = val;
+ false_reg->value_from_signed = value_from_signed;
+ true_reg->min_value = val + 1;
+ true_reg->value_from_signed = value_from_signed;
+ break;
+ case BPF_JGE:
+ value_from_signed = false;
+ /* fallthrough */
+ case BPF_JSGE:
+ if (true_reg->value_from_signed != value_from_signed)
+ reset_reg_range_values(true_reg, 0);
+ if (false_reg->value_from_signed != value_from_signed)
+ reset_reg_range_values(false_reg, 0);
+ if (opcode == BPF_JGE) {
+ /* Unsigned comparison, the minimum value is 0. */
+ false_reg->min_value = 0;
+ }
+ /* If this is false then we know the maximum value is val - 1,
+ * otherwise we know the mimimum value is val.
+ */
+ false_reg->max_value = val - 1;
+ false_reg->value_from_signed = value_from_signed;
+ true_reg->min_value = val;
+ true_reg->value_from_signed = value_from_signed;
+ break;
+ default:
+ break;
}
- return 0;
+ check_reg_overflow(false_reg);
+ check_reg_overflow(true_reg);
+ if (is_range) {
+ if (__is_pointer_value(false, false_reg))
+ reset_reg_range_values(false_reg, 0);
+ if (__is_pointer_value(false, true_reg))
+ reset_reg_range_values(true_reg, 0);
+ }
+}
+
+/* Same as above, but for the case that dst_reg is a CONST_IMM reg and src_reg
+ * is the variable reg.
+ */
+static void reg_set_min_max_inv(struct bpf_reg_state *true_reg,
+ struct bpf_reg_state *false_reg, u64 val,
+ u8 opcode)
+{
+ bool value_from_signed = true;
+ bool is_range = true;
+
+ switch (opcode) {
+ case BPF_JEQ:
+ /* If this is false then we know nothing Jon Snow, but if it is
+ * true then we know for sure.
+ */
+ true_reg->max_value = true_reg->min_value = val;
+ is_range = false;
+ break;
+ case BPF_JNE:
+ /* If this is true we know nothing Jon Snow, but if it is false
+ * we know the value for sure;
+ */
+ false_reg->max_value = false_reg->min_value = val;
+ is_range = false;
+ break;
+ case BPF_JGT:
+ value_from_signed = false;
+ /* fallthrough */
+ case BPF_JSGT:
+ if (true_reg->value_from_signed != value_from_signed)
+ reset_reg_range_values(true_reg, 0);
+ if (false_reg->value_from_signed != value_from_signed)
+ reset_reg_range_values(false_reg, 0);
+ if (opcode == BPF_JGT) {
+ /* Unsigned comparison, the minimum value is 0. */
+ true_reg->min_value = 0;
+ }
+ /*
+ * If this is false, then the val is <= the register, if it is
+ * true the register <= to the val.
+ */
+ false_reg->min_value = val;
+ false_reg->value_from_signed = value_from_signed;
+ true_reg->max_value = val - 1;
+ true_reg->value_from_signed = value_from_signed;
+ break;
+ case BPF_JGE:
+ value_from_signed = false;
+ /* fallthrough */
+ case BPF_JSGE:
+ if (true_reg->value_from_signed != value_from_signed)
+ reset_reg_range_values(true_reg, 0);
+ if (false_reg->value_from_signed != value_from_signed)
+ reset_reg_range_values(false_reg, 0);
+ if (opcode == BPF_JGE) {
+ /* Unsigned comparison, the minimum value is 0. */
+ true_reg->min_value = 0;
+ }
+ /* If this is false then constant < register, if it is true then
+ * the register < constant.
+ */
+ false_reg->min_value = val + 1;
+ false_reg->value_from_signed = value_from_signed;
+ true_reg->max_value = val;
+ true_reg->value_from_signed = value_from_signed;
+ break;
+ default:
+ break;
+ }
+
+ check_reg_overflow(false_reg);
+ check_reg_overflow(true_reg);
+ if (is_range) {
+ if (__is_pointer_value(false, false_reg))
+ reset_reg_range_values(false_reg, 0);
+ if (__is_pointer_value(false, true_reg))
+ reset_reg_range_values(true_reg, 0);
+ }
+}
+
+static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id,
+ enum bpf_reg_type type)
+{
+ struct bpf_reg_state *reg = &regs[regno];
+
+ if (reg->type == PTR_TO_MAP_VALUE_OR_NULL && reg->id == id) {
+ reg->type = type;
+ /* We don't need id from this point onwards anymore, thus we
+ * should better reset it, so that state pruning has chances
+ * to take effect.
+ */
+ reg->id = 0;
+ if (type == UNKNOWN_VALUE)
+ __mark_reg_unknown_value(regs, regno);
+ }
+}
+
+/* The logic is similar to find_good_pkt_pointers(), both could eventually
+ * be folded together at some point.
+ */
+static void mark_map_regs(struct bpf_verifier_state *state, u32 regno,
+ enum bpf_reg_type type)
+{
+ struct bpf_reg_state *regs = state->regs;
+ u32 id = regs[regno].id;
+ int i;
+
+ for (i = 0; i < MAX_BPF_REG; i++)
+ mark_map_reg(regs, i, id, type);
+
+ for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
+ if (state->stack_slot_type[i] != STACK_SPILL)
+ continue;
+ mark_map_reg(state->spilled_regs, i / BPF_REG_SIZE, id, type);
+ }
}
-static int check_cond_jmp_op(struct verifier_env *env,
+static int check_cond_jmp_op(struct bpf_verifier_env *env,
struct bpf_insn *insn, int *insn_idx)
{
- struct reg_state *regs = env->cur_state.regs;
- struct verifier_state *other_branch;
+ struct bpf_verifier_state *other_branch, *this_branch = &env->cur_state;
+ struct bpf_reg_state *regs = this_branch->regs, *dst_reg;
u8 opcode = BPF_OP(insn->code);
int err;
@@ -1290,11 +2272,12 @@ static int check_cond_jmp_op(struct verifier_env *env,
if (err)
return err;
+ dst_reg = &regs[insn->dst_reg];
+
/* detect if R == 0 where R was initialized to zero earlier */
if (BPF_SRC(insn->code) == BPF_K &&
(opcode == BPF_JEQ || opcode == BPF_JNE) &&
- regs[insn->dst_reg].type == CONST_IMM &&
- regs[insn->dst_reg].imm == insn->imm) {
+ dst_reg->type == CONST_IMM && dst_reg->imm == insn->imm) {
if (opcode == BPF_JEQ) {
/* if (imm == imm) goto pc+off;
* only follow the goto, ignore fall-through
@@ -1314,46 +2297,48 @@ static int check_cond_jmp_op(struct verifier_env *env,
if (!other_branch)
return -EFAULT;
- /* detect if R == 0 where R is returned value from bpf_map_lookup_elem() */
+ /* detect if we are comparing against a constant value so we can adjust
+ * our min/max values for our dst register.
+ */
+ if (BPF_SRC(insn->code) == BPF_X) {
+ if (regs[insn->src_reg].type == CONST_IMM)
+ reg_set_min_max(&other_branch->regs[insn->dst_reg],
+ dst_reg, regs[insn->src_reg].imm,
+ opcode);
+ else if (dst_reg->type == CONST_IMM)
+ reg_set_min_max_inv(&other_branch->regs[insn->src_reg],
+ &regs[insn->src_reg], dst_reg->imm,
+ opcode);
+ } else {
+ reg_set_min_max(&other_branch->regs[insn->dst_reg],
+ dst_reg, insn->imm, opcode);
+ }
+
+ /* detect if R == 0 where R is returned from bpf_map_lookup_elem() */
if (BPF_SRC(insn->code) == BPF_K &&
- insn->imm == 0 && (opcode == BPF_JEQ ||
- opcode == BPF_JNE) &&
- regs[insn->dst_reg].type == PTR_TO_MAP_VALUE_OR_NULL) {
- if (opcode == BPF_JEQ) {
- /* next fallthrough insn can access memory via
- * this register
- */
- regs[insn->dst_reg].type = PTR_TO_MAP_VALUE;
- /* branch targer cannot access it, since reg == 0 */
- other_branch->regs[insn->dst_reg].type = CONST_IMM;
- other_branch->regs[insn->dst_reg].imm = 0;
- } else {
- other_branch->regs[insn->dst_reg].type = PTR_TO_MAP_VALUE;
- regs[insn->dst_reg].type = CONST_IMM;
- regs[insn->dst_reg].imm = 0;
- }
+ insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) &&
+ dst_reg->type == PTR_TO_MAP_VALUE_OR_NULL) {
+ /* Mark all identical map registers in each branch as either
+ * safe or unknown depending R == 0 or R != 0 conditional.
+ */
+ mark_map_regs(this_branch, insn->dst_reg,
+ opcode == BPF_JEQ ? PTR_TO_MAP_VALUE : UNKNOWN_VALUE);
+ mark_map_regs(other_branch, insn->dst_reg,
+ opcode == BPF_JEQ ? UNKNOWN_VALUE : PTR_TO_MAP_VALUE);
+ } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT &&
+ dst_reg->type == PTR_TO_PACKET &&
+ regs[insn->src_reg].type == PTR_TO_PACKET_END) {
+ find_good_pkt_pointers(this_branch, dst_reg);
+ } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE &&
+ dst_reg->type == PTR_TO_PACKET_END &&
+ regs[insn->src_reg].type == PTR_TO_PACKET) {
+ find_good_pkt_pointers(other_branch, &regs[insn->src_reg]);
} else if (is_pointer_value(env, insn->dst_reg)) {
verbose("R%d pointer comparison prohibited\n", insn->dst_reg);
return -EACCES;
- } else if (BPF_SRC(insn->code) == BPF_K &&
- (opcode == BPF_JEQ || opcode == BPF_JNE)) {
-
- if (opcode == BPF_JEQ) {
- /* detect if (R == imm) goto
- * and in the target state recognize that R = imm
- */
- other_branch->regs[insn->dst_reg].type = CONST_IMM;
- other_branch->regs[insn->dst_reg].imm = insn->imm;
- } else {
- /* detect if (R != imm) goto
- * and in the fall-through state recognize that R = imm
- */
- regs[insn->dst_reg].type = CONST_IMM;
- regs[insn->dst_reg].imm = insn->imm;
- }
}
if (log_level)
- print_verifier_state(env);
+ print_verifier_state(this_branch);
return 0;
}
@@ -1366,9 +2351,9 @@ static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn)
}
/* verify BPF_LD_IMM64 instruction */
-static int check_ld_imm(struct verifier_env *env, struct bpf_insn *insn)
+static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
{
- struct reg_state *regs = env->cur_state.regs;
+ struct bpf_reg_state *regs = env->cur_state.regs;
int err;
if (BPF_SIZE(insn->code) != BPF_DW) {
@@ -1384,9 +2369,19 @@ static int check_ld_imm(struct verifier_env *env, struct bpf_insn *insn)
if (err)
return err;
- if (insn->src_reg == 0)
- /* generic move 64-bit immediate into a register */
+ if (insn->src_reg == 0) {
+ /* generic move 64-bit immediate into a register,
+ * only analyzer needs to collect the ld_imm value.
+ */
+ u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm;
+
+ if (!env->analyzer_ops)
+ return 0;
+
+ regs[insn->dst_reg].type = CONST_IMM;
+ regs[insn->dst_reg].imm = imm;
return 0;
+ }
/* replace_map_fd_with_map_ptr() should have caught bad ld_imm64 */
BUG_ON(insn->src_reg != BPF_PSEUDO_MAP_FD);
@@ -1423,22 +2418,22 @@ static bool may_access_skb(enum bpf_prog_type type)
* Output:
* R0 - 8/16/32-bit skb data converted to cpu endianness
*/
-static int check_ld_abs(struct verifier_env *env, struct bpf_insn *insn)
+static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
{
- struct reg_state *regs = env->cur_state.regs;
+ struct bpf_reg_state *regs = env->cur_state.regs;
u8 mode = BPF_MODE(insn->code);
- struct reg_state *reg;
+ struct bpf_reg_state *reg;
int i, err;
if (!may_access_skb(env->prog->type)) {
- verbose("BPF_LD_ABS|IND instructions not allowed for this program type\n");
+ verbose("BPF_LD_[ABS|IND] instructions not allowed for this program type\n");
return -EINVAL;
}
if (insn->dst_reg != BPF_REG_0 || insn->off != 0 ||
BPF_SIZE(insn->code) == BPF_DW ||
(mode == BPF_ABS && insn->src_reg != BPF_REG_0)) {
- verbose("BPF_LD_ABS uses reserved fields\n");
+ verbose("BPF_LD_[ABS|IND] uses reserved fields\n");
return -EINVAL;
}
@@ -1513,7 +2508,7 @@ enum {
BRANCH = 2,
};
-#define STATE_LIST_MARK ((struct verifier_state_list *) -1L)
+#define STATE_LIST_MARK ((struct bpf_verifier_state_list *) -1L)
static int *insn_stack; /* stack of insns to process */
static int cur_stack; /* current stack index */
@@ -1524,7 +2519,7 @@ static int *insn_state;
* w - next instruction
* e - edge
*/
-static int push_insn(int t, int w, int e, struct verifier_env *env)
+static int push_insn(int t, int w, int e, struct bpf_verifier_env *env)
{
if (e == FALLTHROUGH && insn_state[t] >= (DISCOVERED | FALLTHROUGH))
return 0;
@@ -1565,7 +2560,7 @@ static int push_insn(int t, int w, int e, struct verifier_env *env)
/* non-recursive depth-first-search to detect loops in BPF program
* loop == back-edge in directed graph
*/
-static int check_cfg(struct verifier_env *env)
+static int check_cfg(struct bpf_verifier_env *env)
{
struct bpf_insn *insns = env->prog->insnsi;
int insn_cnt = env->prog->len;
@@ -1602,6 +2597,8 @@ peek_stack:
goto peek_stack;
else if (ret < 0)
goto err_free;
+ if (t + 1 < insn_cnt)
+ env->explored_states[t + 1] = STATE_LIST_MARK;
} else if (opcode == BPF_JA) {
if (BPF_SRC(insns[t].code) != BPF_K) {
ret = -EINVAL;
@@ -1621,6 +2618,7 @@ peek_stack:
env->explored_states[t + 1] = STATE_LIST_MARK;
} else {
/* conditional jump with two edges */
+ env->explored_states[t] = STATE_LIST_MARK;
ret = push_insn(t, t + 1, FALLTHROUGH, env);
if (ret == 1)
goto peek_stack;
@@ -1669,6 +2667,59 @@ err_free:
return ret;
}
+/* the following conditions reduce the number of explored insns
+ * from ~140k to ~80k for ultra large programs that use a lot of ptr_to_packet
+ */
+static bool compare_ptrs_to_packet(struct bpf_reg_state *old,
+ struct bpf_reg_state *cur)
+{
+ if (old->id != cur->id)
+ return false;
+
+ /* old ptr_to_packet is more conservative, since it allows smaller
+ * range. Ex:
+ * old(off=0,r=10) is equal to cur(off=0,r=20), because
+ * old(off=0,r=10) means that with range=10 the verifier proceeded
+ * further and found no issues with the program. Now we're in the same
+ * spot with cur(off=0,r=20), so we're safe too, since anything further
+ * will only be looking at most 10 bytes after this pointer.
+ */
+ if (old->off == cur->off && old->range < cur->range)
+ return true;
+
+ /* old(off=20,r=10) is equal to cur(off=22,re=22 or 5 or 0)
+ * since both cannot be used for packet access and safe(old)
+ * pointer has smaller off that could be used for further
+ * 'if (ptr > data_end)' check
+ * Ex:
+ * old(off=20,r=10) and cur(off=22,r=22) and cur(off=22,r=0) mean
+ * that we cannot access the packet.
+ * The safe range is:
+ * [ptr, ptr + range - off)
+ * so whenever off >=range, it means no safe bytes from this pointer.
+ * When comparing old->off <= cur->off, it means that older code
+ * went with smaller offset and that offset was later
+ * used to figure out the safe range after 'if (ptr > data_end)' check
+ * Say, 'old' state was explored like:
+ * ... R3(off=0, r=0)
+ * R4 = R3 + 20
+ * ... now R4(off=20,r=0) <-- here
+ * if (R4 > data_end)
+ * ... R4(off=20,r=20), R3(off=0,r=20) and R3 can be used to access.
+ * ... the code further went all the way to bpf_exit.
+ * Now the 'cur' state at the mark 'here' has R4(off=30,r=0).
+ * old_R4(off=20,r=0) equal to cur_R4(off=30,r=0), since if the verifier
+ * goes further, such cur_R4 will give larger safe packet range after
+ * 'if (R4 > data_end)' and all further insn were already good with r=20,
+ * so they will be good with r=30 and we can prune the search.
+ */
+ if (old->off <= cur->off &&
+ old->off >= old->range && cur->off >= cur->range)
+ return true;
+
+ return false;
+}
+
/* compare two verifier states
*
* all states stored in state_list are known to be valid, since
@@ -1695,19 +2746,49 @@ err_free:
* whereas register type in current state is meaningful, it means that
* the current state will reach 'bpf_exit' instruction safely
*/
-static bool states_equal(struct verifier_state *old, struct verifier_state *cur)
+static bool states_equal(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *old,
+ struct bpf_verifier_state *cur)
{
+ bool varlen_map_access = env->varlen_map_value_access;
+ struct bpf_reg_state *rold, *rcur;
int i;
for (i = 0; i < MAX_BPF_REG; i++) {
- if (memcmp(&old->regs[i], &cur->regs[i],
- sizeof(old->regs[0])) != 0) {
- if (old->regs[i].type == NOT_INIT ||
- (old->regs[i].type == UNKNOWN_VALUE &&
- cur->regs[i].type != NOT_INIT))
- continue;
- return false;
- }
+ rold = &old->regs[i];
+ rcur = &cur->regs[i];
+
+ if (memcmp(rold, rcur, sizeof(*rold)) == 0)
+ continue;
+
+ /* If the ranges were not the same, but everything else was and
+ * we didn't do a variable access into a map then we are a-ok.
+ */
+ if (!varlen_map_access &&
+ memcmp(rold, rcur, offsetofend(struct bpf_reg_state, id)) == 0)
+ continue;
+
+ /* If we didn't map access then again we don't care about the
+ * mismatched range values and it's ok if our old type was
+ * UNKNOWN and we didn't go to a NOT_INIT'ed or pointer reg.
+ */
+ if (rold->type == NOT_INIT ||
+ (!varlen_map_access && rold->type == UNKNOWN_VALUE &&
+ rcur->type != NOT_INIT &&
+ !__is_pointer_value(env->allow_ptr_leaks, rcur)))
+ continue;
+
+ /* Don't care about the reg->id in this case. */
+ if (rold->type == PTR_TO_MAP_VALUE_OR_NULL &&
+ rcur->type == PTR_TO_MAP_VALUE_OR_NULL &&
+ rold->map_ptr == rcur->map_ptr)
+ continue;
+
+ if (rold->type == PTR_TO_PACKET && rcur->type == PTR_TO_PACKET &&
+ compare_ptrs_to_packet(rold, rcur))
+ continue;
+
+ return false;
}
for (i = 0; i < MAX_BPF_STACK; i++) {
@@ -1729,9 +2810,9 @@ static bool states_equal(struct verifier_state *old, struct verifier_state *cur)
* the same, check that stored pointers types
* are the same as well.
* Ex: explored safe path could have stored
- * (struct reg_state) {.type = PTR_TO_STACK, .imm = -8}
+ * (bpf_reg_state) {.type = PTR_TO_STACK, .imm = -8}
* but current path has stored:
- * (struct reg_state) {.type = PTR_TO_STACK, .imm = -16}
+ * (bpf_reg_state) {.type = PTR_TO_STACK, .imm = -16}
* such verifier states are not equivalent.
* return false to continue verification of this path
*/
@@ -1742,10 +2823,10 @@ static bool states_equal(struct verifier_state *old, struct verifier_state *cur)
return true;
}
-static int is_state_visited(struct verifier_env *env, int insn_idx)
+static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
{
- struct verifier_state_list *new_sl;
- struct verifier_state_list *sl;
+ struct bpf_verifier_state_list *new_sl;
+ struct bpf_verifier_state_list *sl;
sl = env->explored_states[insn_idx];
if (!sl)
@@ -1755,7 +2836,7 @@ static int is_state_visited(struct verifier_env *env, int insn_idx)
return 0;
while (sl != STATE_LIST_MARK) {
- if (states_equal(&sl->state, &env->cur_state))
+ if (states_equal(env, &sl->state, &env->cur_state))
/* reached equivalent register/stack state,
* prune the search
*/
@@ -1769,7 +2850,7 @@ static int is_state_visited(struct verifier_env *env, int insn_idx)
* it will be rejected. Since there are no loops, we won't be
* seeing this 'insn_idx' instruction again on the way to bpf_exit
*/
- new_sl = kmalloc(sizeof(struct verifier_state_list), GFP_USER);
+ new_sl = kmalloc(sizeof(struct bpf_verifier_state_list), GFP_USER);
if (!new_sl)
return -ENOMEM;
@@ -1780,11 +2861,20 @@ static int is_state_visited(struct verifier_env *env, int insn_idx)
return 0;
}
-static int do_check(struct verifier_env *env)
+static int ext_analyzer_insn_hook(struct bpf_verifier_env *env,
+ int insn_idx, int prev_insn_idx)
{
- struct verifier_state *state = &env->cur_state;
+ if (!env->analyzer_ops || !env->analyzer_ops->insn_hook)
+ return 0;
+
+ return env->analyzer_ops->insn_hook(env, insn_idx, prev_insn_idx);
+}
+
+static int do_check(struct bpf_verifier_env *env)
+{
+ struct bpf_verifier_state *state = &env->cur_state;
struct bpf_insn *insns = env->prog->insnsi;
- struct reg_state *regs = state->regs;
+ struct bpf_reg_state *regs = state->regs;
int insn_cnt = env->prog->len;
int insn_idx, prev_insn_idx = 0;
int insn_processed = 0;
@@ -1792,6 +2882,7 @@ static int do_check(struct verifier_env *env)
init_reg_state(regs);
insn_idx = 0;
+ env->varlen_map_value_access = false;
for (;;) {
struct bpf_insn *insn;
u8 class;
@@ -1806,7 +2897,7 @@ static int do_check(struct verifier_env *env)
insn = &insns[insn_idx];
class = BPF_CLASS(insn->code);
- if (++insn_processed > 32768) {
+ if (++insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) {
verbose("BPF program is too large. Proccessed %d insn\n",
insn_processed);
return -E2BIG;
@@ -1827,9 +2918,15 @@ static int do_check(struct verifier_env *env)
goto process_bpf_exit;
}
+ if (signal_pending(current))
+ return -EAGAIN;
+
+ if (need_resched())
+ cond_resched();
+
if (log_level && do_print_state) {
verbose("\nfrom %d to %d:", prev_insn_idx, insn_idx);
- print_verifier_state(env);
+ print_verifier_state(&env->cur_state);
do_print_state = false;
}
@@ -1838,6 +2935,10 @@ static int do_check(struct verifier_env *env)
print_bpf_insn(env, insn);
}
+ err = ext_analyzer_insn_hook(env, insn_idx, prev_insn_idx);
+ if (err)
+ return err;
+
env->insn_aux_data[insn_idx].seen = true;
if (class == BPF_ALU || class == BPF_ALU64) {
err = check_alu_op(env, insn);
@@ -1869,6 +2970,7 @@ static int do_check(struct verifier_env *env)
if (err)
return err;
+ reset_reg_range_values(regs, insn->dst_reg);
if (BPF_SIZE(insn->code) != BPF_W &&
BPF_SIZE(insn->code) != BPF_DW) {
insn_idx++;
@@ -2046,6 +3148,7 @@ process_bpf_exit:
verbose("invalid BPF_LD mode\n");
return -EINVAL;
}
+ reset_reg_range_values(regs, insn->dst_reg);
} else {
verbose("unknown insn class %d\n", class);
return -EINVAL;
@@ -2054,17 +3157,32 @@ process_bpf_exit:
insn_idx++;
}
+ verbose("processed %d insns\n", insn_processed);
+ return 0;
+}
+
+static int check_map_prog_compatibility(struct bpf_map *map,
+ struct bpf_prog *prog)
+
+{
+ if (prog->type == BPF_PROG_TYPE_PERF_EVENT &&
+ (map->map_type == BPF_MAP_TYPE_HASH ||
+ map->map_type == BPF_MAP_TYPE_PERCPU_HASH) &&
+ (map->map_flags & BPF_F_NO_PREALLOC)) {
+ verbose("perf_event programs can only use preallocated hash map\n");
+ return -EINVAL;
+ }
return 0;
}
/* look for pseudo eBPF instructions that access map FDs and
* replace them with actual map pointers
*/
-static int replace_map_fd_with_map_ptr(struct verifier_env *env)
+static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
{
struct bpf_insn *insn = env->prog->insnsi;
int insn_cnt = env->prog->len;
- int i, j;
+ int i, j, err;
for (i = 0; i < insn_cnt; i++, insn++) {
if (BPF_CLASS(insn->code) == BPF_LDX &&
@@ -2108,6 +3226,12 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env)
return PTR_ERR(map);
}
+ err = check_map_prog_compatibility(map, env->prog);
+ if (err) {
+ fdput(f);
+ return err;
+ }
+
/* store map pointer inside BPF_LD_IMM64 instruction */
insn[0].imm = (u32) (unsigned long) map;
insn[1].imm = ((u64) (unsigned long) map) >> 32;
@@ -2151,7 +3275,7 @@ next_insn:
}
/* drop refcnt of maps used by the rejected program */
-static void release_maps(struct verifier_env *env)
+static void release_maps(struct bpf_verifier_env *env)
{
int i;
@@ -2160,7 +3284,7 @@ static void release_maps(struct verifier_env *env)
}
/* convert pseudo BPF_LD_IMM64 into generic BPF_LD_IMM64 */
-static void convert_pseudo_ld_imm64(struct verifier_env *env)
+static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env)
{
struct bpf_insn *insn = env->prog->insnsi;
int insn_cnt = env->prog->len;
@@ -2175,7 +3299,7 @@ static void convert_pseudo_ld_imm64(struct verifier_env *env)
* insni[off, off + cnt). Adjust corresponding insn_aux_data by copying
* [0, off) and [off, end) to new locations, so the patched range stays zero
*/
-static int adjust_insn_aux_data(struct verifier_env *env, u32 prog_len,
+static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len,
u32 off, u32 cnt)
{
struct bpf_insn_aux_data *new_data, *old_data = env->insn_aux_data;
@@ -2196,7 +3320,7 @@ static int adjust_insn_aux_data(struct verifier_env *env, u32 prog_len,
return 0;
}
-static struct bpf_prog *bpf_patch_insn_data(struct verifier_env *env, u32 off,
+static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 off,
const struct bpf_insn *patch, u32 len)
{
struct bpf_prog *new_prog;
@@ -2213,7 +3337,7 @@ static struct bpf_prog *bpf_patch_insn_data(struct verifier_env *env, u32 off,
* branches that are dead at run time. Malicious programs can have dead code
* too. Therefore replace all dead at-run-time code with nops.
*/
-static void sanitize_dead_code(struct verifier_env *env)
+static void sanitize_dead_code(struct bpf_verifier_env *env)
{
struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
struct bpf_insn nop = BPF_MOV64_REG(BPF_REG_0, BPF_REG_0);
@@ -2231,21 +3355,37 @@ static void sanitize_dead_code(struct verifier_env *env)
/* convert load instructions that access fields of 'struct __sk_buff'
* into sequence of instructions that access fields of 'struct sk_buff'
*/
-static int convert_ctx_accesses(struct verifier_env *env)
+static int convert_ctx_accesses(struct bpf_verifier_env *env)
{
- struct bpf_insn *insn = env->prog->insnsi;
+ const struct bpf_verifier_ops *ops = env->prog->aux->ops;
const int insn_cnt = env->prog->len;
- struct bpf_insn insn_buf[16];
+ struct bpf_insn insn_buf[16], *insn;
struct bpf_prog *new_prog;
enum bpf_access_type type;
- int i, delta = 0;
+ int i, cnt, delta = 0;
+
+ if (ops->gen_prologue) {
+ cnt = ops->gen_prologue(insn_buf, env->seen_direct_write,
+ env->prog);
+ if (cnt >= ARRAY_SIZE(insn_buf)) {
+ verbose("bpf verifier is misconfigured\n");
+ return -EINVAL;
+ } else if (cnt) {
+ new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ env->prog = new_prog;
+ delta += cnt - 1;
+ }
+ }
- if (!env->prog->aux->ops->convert_ctx_access)
+ if (!ops->convert_ctx_access)
return 0;
- for (i = 0; i < insn_cnt; i++, insn++) {
- u32 cnt;
+ insn = env->prog->insnsi + delta;
+ for (i = 0; i < insn_cnt; i++, insn++) {
if (insn->code == (BPF_LDX | BPF_MEM | BPF_W) ||
insn->code == (BPF_LDX | BPF_MEM | BPF_DW))
type = BPF_READ;
@@ -2286,9 +3426,8 @@ static int convert_ctx_accesses(struct verifier_env *env)
if (env->insn_aux_data[i + delta].ptr_type != PTR_TO_CTX)
continue;
- cnt = env->prog->aux->ops->
- convert_ctx_access(type, insn->dst_reg, insn->src_reg,
- insn->off, insn_buf, env->prog);
+ cnt = ops->convert_ctx_access(type, insn->dst_reg, insn->src_reg,
+ insn->off, insn_buf, env->prog);
if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
verbose("bpf verifier is misconfigured\n");
return -EINVAL;
@@ -2312,7 +3451,7 @@ static int convert_ctx_accesses(struct verifier_env *env)
*
* this function is called after eBPF program passed verification
*/
-static int fixup_bpf_calls(struct verifier_env *env)
+static int fixup_bpf_calls(struct bpf_verifier_env *env)
{
struct bpf_prog *prog = env->prog;
struct bpf_insn *insn = prog->insnsi;
@@ -2323,6 +3462,7 @@ static int fixup_bpf_calls(struct verifier_env *env)
struct bpf_map *map_ptr;
int i, cnt, delta = 0;
+
for (i = 0; i < insn_cnt; i++, insn++) {
if (insn->code == (BPF_ALU | BPF_MOD | BPF_X) ||
insn->code == (BPF_ALU | BPF_DIV | BPF_X)) {
@@ -2354,9 +3494,9 @@ static int fixup_bpf_calls(struct verifier_env *env)
* conditional branch in the interpeter for every normal
* call and to prevent accidental JITing by JIT compiler
* that doesn't support bpf_tail_call yet
- */
+ */
insn->imm = 0;
- insn->code |= BPF_X;
+ insn->code = BPF_JMP | BPF_TAIL_CALL;
/* instead of changing every JIT dealing with tail_call
* emit two extra insns:
@@ -2400,9 +3540,9 @@ static int fixup_bpf_calls(struct verifier_env *env)
return 0;
}
-static void free_states(struct verifier_env *env)
+static void free_states(struct bpf_verifier_env *env)
{
- struct verifier_state_list *sl, *sln;
+ struct bpf_verifier_state_list *sl, *sln;
int i;
if (!env->explored_states)
@@ -2425,16 +3565,16 @@ static void free_states(struct verifier_env *env)
int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
{
char __user *log_ubuf = NULL;
- struct verifier_env *env;
+ struct bpf_verifier_env *env;
int ret = -EINVAL;
if ((*prog)->len <= 0 || (*prog)->len > BPF_MAXINSNS)
return -E2BIG;
- /* 'struct verifier_env' can be global, but since it's not small,
+ /* 'struct bpf_verifier_env' can be global, but since it's not small,
* allocate/free it every time bpf_check() is called
*/
- env = kzalloc(sizeof(struct verifier_env), GFP_KERNEL);
+ env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL);
if (!env)
return -ENOMEM;
@@ -2476,7 +3616,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
goto skip_full_check;
env->explored_states = kcalloc(env->prog->len,
- sizeof(struct verifier_state_list *),
+ sizeof(struct bpf_verifier_state_list *),
GFP_USER);
ret = -ENOMEM;
if (!env->explored_states)
@@ -2554,3 +3694,54 @@ err_free_env:
kfree(env);
return ret;
}
+
+int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops,
+ void *priv)
+{
+ struct bpf_verifier_env *env;
+ int ret;
+
+ env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL);
+ if (!env)
+ return -ENOMEM;
+
+ env->insn_aux_data = vzalloc(sizeof(struct bpf_insn_aux_data) *
+ prog->len);
+ ret = -ENOMEM;
+ if (!env->insn_aux_data)
+ goto err_free_env;
+ env->prog = prog;
+ env->analyzer_ops = ops;
+ env->analyzer_priv = priv;
+
+ /* grab the mutex to protect few globals used by verifier */
+ mutex_lock(&bpf_verifier_lock);
+
+ log_level = 0;
+
+ env->explored_states = kcalloc(env->prog->len,
+ sizeof(struct bpf_verifier_state_list *),
+ GFP_KERNEL);
+ ret = -ENOMEM;
+ if (!env->explored_states)
+ goto skip_full_check;
+
+ ret = check_cfg(env);
+ if (ret < 0)
+ goto skip_full_check;
+
+ env->allow_ptr_leaks = capable(CAP_SYS_ADMIN);
+
+ ret = do_check(env);
+
+skip_full_check:
+ while (pop_stack(env, NULL) >= 0);
+ free_states(env);
+
+ mutex_unlock(&bpf_verifier_lock);
+ vfree(env->insn_aux_data);
+err_free_env:
+ kfree(env);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(bpf_analyzer);