summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- include/linux/bpf.h 30
-rw-r--r-- include/linux/trace_events.h 43
-rw-r--r-- include/trace/perf.h 6
-rw-r--r-- kernel/bpf/core.c 81
-rw-r--r-- kernel/events/core.c 26
-rw-r--r-- kernel/trace/bpf_trace.c 82
-rw-r--r-- kernel/trace/trace_kprobe.c 6
-rw-r--r-- kernel/trace/trace_syscalls.c 8
-rw-r--r-- kernel/trace/trace_uprobe.c 3
9 files changed, 242 insertions, 43 deletions
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index e62febba833c..438efe7ed053 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -252,18 +252,38 @@ struct bpf_prog_array {
struct bpf_prog_array __rcu *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags);
void bpf_prog_array_free(struct bpf_prog_array __rcu *progs);
-#define BPF_PROG_RUN_ARRAY(array, ctx, func) \
+void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *progs,
+ struct bpf_prog *old_prog);
+int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array,
+ struct bpf_prog *exclude_prog,
+ struct bpf_prog *include_prog,
+ struct bpf_prog_array **new_array);
+
+#define __BPF_PROG_RUN_ARRAY(array, ctx, func, check_non_null) \
({ \
- struct bpf_prog **_prog; \
+ struct bpf_prog **_prog, *__prog; \
+ struct bpf_prog_array *_array; \
u32 _ret = 1; \
rcu_read_lock(); \
- _prog = rcu_dereference(array)->progs; \
- for (; *_prog; _prog++) \
- _ret &= func(*_prog, ctx); \
+ _array = rcu_dereference(array); \
+ if (unlikely(check_non_null && !_array))\
+ goto _out; \
+ _prog = _array->progs; \
+ while ((__prog = READ_ONCE(*_prog))) { \
+ _ret &= func(__prog, ctx); \
+ _prog++; \
+ } \
+_out: \
rcu_read_unlock(); \
_ret; \
})
+#define BPF_PROG_RUN_ARRAY(array, ctx, func) \
+ __BPF_PROG_RUN_ARRAY(array, ctx, func, false)
+
+#define BPF_PROG_RUN_ARRAY_CHECK(array, ctx, func) \
+ __BPF_PROG_RUN_ARRAY(array, ctx, func, true)
+
#ifdef CONFIG_BPF_SYSCALL
DECLARE_PER_CPU(int, bpf_prog_active);
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index ca6c1a0542ad..17fe42e836b4 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -302,14 +302,37 @@ struct trace_event_call {
#ifdef CONFIG_PERF_EVENTS
int perf_refcount;
struct hlist_head __percpu *perf_events;
- struct bpf_prog *prog;
- struct perf_event *bpf_prog_owner;
+ struct bpf_prog_array __rcu *prog_array;
int (*perf_perm)(struct trace_event_call *,
struct perf_event *);
#endif
};
+#ifdef CONFIG_PERF_EVENTS
+static inline bool bpf_prog_array_valid(struct trace_event_call *call)
+{
+ /*
+ * This inline function checks whether call->prog_array
+ * is valid or not. The function is called in various places,
+ * outside rcu_read_lock/unlock, as a heuristic to speed up execution.
+ *
+ * If this function returns true, and later call->prog_array
+ * becomes NULL inside rcu_read_lock/unlock region,
+ * we bail out then. If this function returns false,
+ * there is a risk that we might miss a few events if the checking
+ * were delayed until inside rcu_read_lock/unlock region and
+ * call->prog_array happened to become non-NULL then.
+ *
+ * Here, READ_ONCE() is used instead of rcu_access_pointer().
+ * rcu_access_pointer() requires the actual definition of
+ * "struct bpf_prog_array" while READ_ONCE() only needs
+ * a declaration of the same type.
+ */
+ return !!READ_ONCE(call->prog_array);
+}
+#endif
+
static inline const char *
trace_event_name(struct trace_event_call *call)
{
@@ -562,12 +585,23 @@ event_trigger_unlock_commit_regs(struct trace_event_file *file,
}
#ifdef CONFIG_BPF_EVENTS
-unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx);
+unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx);
+int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog);
+void perf_event_detach_bpf_prog(struct perf_event *event);
#else
-static inline unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx)
+static inline unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
{
return 1;
}
+
+static inline int
+perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline void perf_event_detach_bpf_prog(struct perf_event *event) { }
+
#endif
enum {
@@ -638,6 +672,7 @@ perf_trace_buf_submit(void *raw_data, int size, int rctx, u16 type,
{
perf_tp_event(type, count, raw_data, size, regs, head, rctx, task);
}
+
#endif
#endif /* _LINUX_TRACE_EVENT_H */
diff --git a/include/trace/perf.h b/include/trace/perf.h
index 88de5c205e86..31b8d276a993 100644
--- a/include/trace/perf.h
+++ b/include/trace/perf.h
@@ -34,7 +34,6 @@ perf_trace_##call(void *__data, proto) \
struct trace_event_call *event_call = __data; \
struct trace_event_data_offsets_##call __maybe_unused __data_offsets;\
struct trace_event_raw_##call *entry; \
- struct bpf_prog *prog = event_call->prog; \
struct pt_regs *__regs; \
u64 __count = 1; \
struct task_struct *__task = NULL; \
@@ -46,8 +45,9 @@ perf_trace_##call(void *__data, proto) \
__data_size = trace_event_get_offsets_##call(&__data_offsets, args); \
\
head = this_cpu_ptr(event_call->perf_events); \
- if (!prog && __builtin_constant_p(!__task) && !__task && \
- hlist_empty(head)) \
+ if (!bpf_prog_array_valid(event_call) && \
+ __builtin_constant_p(!__task) && !__task && \
+ hlist_empty(head)) \
return; \
\
__entry_size = ALIGN(__data_size + sizeof(*entry) + sizeof(u32),\
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index d08285b32d8c..c459c2533271 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1060,6 +1060,20 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
}
EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);
+static unsigned int __bpf_prog_ret1(const struct sk_buff *ctx,
+ const struct bpf_insn *insn)
+{
+ return 1;
+}
+
+static struct bpf_prog_dummy {
+ struct bpf_prog prog;
+} dummy_bpf_prog = {
+ .prog = {
+ .bpf_func = __bpf_prog_ret1,
+ },
+};
+
/* to avoid allocating empty bpf_prog_array for cgroups that
* don't have bpf program attached use one global 'empty_prog_array'
* It will not be modified the caller of bpf_prog_array_alloc()
@@ -1091,6 +1105,73 @@ void bpf_prog_array_free(struct bpf_prog_array __rcu *progs)
kfree_rcu(progs, rcu);
}
+void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *progs,
+ struct bpf_prog *old_prog)
+{
+ struct bpf_prog **prog = progs->progs;
+
+ for (; *prog; prog++)
+ if (*prog == old_prog) {
+ WRITE_ONCE(*prog, &dummy_bpf_prog.prog);
+ break;
+ }
+}
+
+int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array,
+ struct bpf_prog *exclude_prog,
+ struct bpf_prog *include_prog,
+ struct bpf_prog_array **new_array)
+{
+ int new_prog_cnt, carry_prog_cnt = 0;
+ struct bpf_prog **existing_prog;
+ struct bpf_prog_array *array;
+ int new_prog_idx = 0;
+
+ /* Figure out how many existing progs we need to carry over to
+ * the new array.
+ */
+ if (old_array) {
+ existing_prog = old_array->progs;
+ for (; *existing_prog; existing_prog++) {
+ if (*existing_prog != exclude_prog &&
+ *existing_prog != &dummy_bpf_prog.prog)
+ carry_prog_cnt++;
+ if (*existing_prog == include_prog)
+ return -EEXIST;
+ }
+ }
+
+ /* How many progs (not NULL) will be in the new array? */
+ new_prog_cnt = carry_prog_cnt;
+ if (include_prog)
+ new_prog_cnt += 1;
+
+ /* Do we have any prog (not NULL) in the new array? */
+ if (!new_prog_cnt) {
+ *new_array = NULL;
+ return 0;
+ }
+
+ /* +1 as the end of prog_array is marked with NULL */
+ array = bpf_prog_array_alloc(new_prog_cnt + 1, GFP_KERNEL);
+ if (!array)
+ return -ENOMEM;
+
+ /* Fill in the new prog array */
+ if (carry_prog_cnt) {
+ existing_prog = old_array->progs;
+ for (; *existing_prog; existing_prog++)
+ if (*existing_prog != exclude_prog &&
+ *existing_prog != &dummy_bpf_prog.prog)
+ array->progs[new_prog_idx++] = *existing_prog;
+ }
+ if (include_prog)
+ array->progs[new_prog_idx++] = include_prog;
+ array->progs[new_prog_idx] = NULL;
+ *new_array = array;
+ return 0;
+}
+
static void bpf_prog_free_deferred(struct work_struct *work)
{
struct bpf_prog_aux *aux;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 3c03da137864..470cb4cb8f74 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7203,11 +7203,9 @@ void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
struct pt_regs *regs, struct hlist_head *head,
struct task_struct *task)
{
- struct bpf_prog *prog = call->prog;
-
- if (prog) {
+ if (bpf_prog_array_valid(call)) {
*(struct pt_regs **)raw_data = regs;
- if (!trace_call_bpf(prog, raw_data) || hlist_empty(head)) {
+ if (!trace_call_bpf(call, raw_data) || hlist_empty(head)) {
perf_swevent_put_recursion_context(rctx);
return;
}
@@ -7410,6 +7408,7 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
{
bool is_kprobe, is_tracepoint;
struct bpf_prog *prog;
+ int ret;
if (event->attr.type == PERF_TYPE_HARDWARE ||
event->attr.type == PERF_TYPE_SOFTWARE)
@@ -7418,9 +7417,6 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
if (event->attr.type != PERF_TYPE_TRACEPOINT)
return -EINVAL;
- if (event->tp_event->prog)
- return -EEXIST;
-
is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
if (!is_kprobe && !is_tracepoint)
@@ -7446,26 +7442,20 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
return -EACCES;
}
}
- event->tp_event->prog = prog;
- event->tp_event->bpf_prog_owner = event;
- return 0;
+ ret = perf_event_attach_bpf_prog(event, prog);
+ if (ret)
+ bpf_prog_put(prog);
+ return ret;
}
static void perf_event_free_bpf_prog(struct perf_event *event)
{
- struct bpf_prog *prog;
-
if (event->attr.type != PERF_TYPE_TRACEPOINT) {
perf_event_free_bpf_handler(event);
return;
}
-
- prog = event->tp_event->prog;
- if (prog && event->tp_event->bpf_prog_owner == event) {
- event->tp_event->prog = NULL;
- bpf_prog_put(prog);
- }
+ perf_event_detach_bpf_prog(event);
}
#else
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index a1cd54a16a99..045bde1a64ac 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -17,7 +17,7 @@
/**
* trace_call_bpf - invoke BPF program
- * @prog: BPF program
+ * @call: tracepoint event
* @ctx: opaque context pointer
*
* kprobe handlers execute BPF programs via this helper.
@@ -29,7 +29,7 @@
* 1 - store kprobe event into ring buffer
* Other values are reserved and currently alias to 1
*/
-unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx)
+unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
{
unsigned int ret;
@@ -49,9 +49,22 @@ unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx)
goto out;
}
- rcu_read_lock();
- ret = BPF_PROG_RUN(prog, ctx);
- rcu_read_unlock();
+ /*
+ * Instead of moving rcu_read_lock/rcu_dereference/rcu_read_unlock
+ * to all call sites, we do a bpf_prog_array_valid() check there to see
+ * whether call->prog_array is empty or not, which is
+ * a heuristic to speed up execution.
+ *
+ * If the prog_array pointer fetched by bpf_prog_array_valid() was
+ * non-NULL, we go into trace_call_bpf() and do the actual
+ * proper rcu_dereference() under RCU lock.
+ * If it turns out that prog_array is NULL by then, we bail out.
+ * Conversely, if the pointer fetched by bpf_prog_array_valid()
+ * was NULL, the prog_array is skipped, at the risk of missing
+ * events if it was updated between that check and the
+ * rcu_dereference(); this is an accepted risk.
+ */
+ ret = BPF_PROG_RUN_ARRAY_CHECK(call->prog_array, ctx, BPF_PROG_RUN);
out:
__this_cpu_dec(bpf_prog_active);
@@ -604,6 +617,65 @@ static const struct bpf_verifier_ops perf_event_prog_ops = {
.convert_ctx_access = pe_prog_convert_ctx_access,
};
+static DEFINE_MUTEX(bpf_event_mutex);
+
+int perf_event_attach_bpf_prog(struct perf_event *event,
+ struct bpf_prog *prog)
+{
+ struct bpf_prog_array __rcu *old_array;
+ struct bpf_prog_array *new_array;
+ int ret = -EEXIST;
+
+ mutex_lock(&bpf_event_mutex);
+
+ if (event->prog)
+ goto out;
+
+ old_array = rcu_dereference_protected(event->tp_event->prog_array,
+ lockdep_is_held(&bpf_event_mutex));
+ ret = bpf_prog_array_copy(old_array, NULL, prog, &new_array);
+ if (ret < 0)
+ goto out;
+
+ /* set the new array to event->tp_event and set event->prog */
+ event->prog = prog;
+ rcu_assign_pointer(event->tp_event->prog_array, new_array);
+ bpf_prog_array_free(old_array);
+
+out:
+ mutex_unlock(&bpf_event_mutex);
+ return ret;
+}
+
+void perf_event_detach_bpf_prog(struct perf_event *event)
+{
+ struct bpf_prog_array __rcu *old_array;
+ struct bpf_prog_array *new_array;
+ int ret;
+
+ mutex_lock(&bpf_event_mutex);
+
+ if (!event->prog)
+ goto out;
+
+ old_array = rcu_dereference_protected(event->tp_event->prog_array,
+ lockdep_is_held(&bpf_event_mutex));
+
+ ret = bpf_prog_array_copy(old_array, event->prog, NULL, &new_array);
+ if (ret < 0) {
+ bpf_prog_array_delete_safe(old_array, event->prog);
+ } else {
+ rcu_assign_pointer(event->tp_event->prog_array, new_array);
+ bpf_prog_array_free(old_array);
+ }
+
+ bpf_prog_put(event->prog);
+ event->prog = NULL;
+
+out:
+ mutex_unlock(&bpf_event_mutex);
+}
+
static struct bpf_prog_type_list perf_event_tl = {
.ops = &perf_event_prog_ops,
.type = BPF_PROG_TYPE_PERF_EVENT,
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 975638e51be5..5db1a3eec84d 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1129,13 +1129,12 @@ static void
kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
{
struct trace_event_call *call = &tk->tp.call;
- struct bpf_prog *prog = call->prog;
struct kprobe_trace_entry_head *entry;
struct hlist_head *head;
int size, __size, dsize;
int rctx;
- if (prog && !trace_call_bpf(prog, regs))
+ if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs))
return;
head = this_cpu_ptr(call->perf_events);
@@ -1165,13 +1164,12 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
struct pt_regs *regs)
{
struct trace_event_call *call = &tk->tp.call;
- struct bpf_prog *prog = call->prog;
struct kretprobe_trace_entry_head *entry;
struct hlist_head *head;
int size, __size, dsize;
int rctx;
- if (prog && !trace_call_bpf(prog, regs))
+ if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs))
return;
head = this_cpu_ptr(call->perf_events);
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index d7cfb6a673e3..e8540540ecab 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -551,6 +551,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
struct syscall_metadata *sys_data;
struct syscall_trace_enter *rec;
struct hlist_head *head;
+ bool valid_prog_array;
int syscall_nr;
int rctx;
int size;
@@ -566,7 +567,8 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
return;
head = this_cpu_ptr(sys_data->enter_event->perf_events);
- if (hlist_empty(head))
+ valid_prog_array = bpf_prog_array_valid(sys_data->enter_event);
+ if (!valid_prog_array && hlist_empty(head))
return;
/* get the size after alignment with the u32 buffer size field */
@@ -626,6 +628,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
struct syscall_metadata *sys_data;
struct syscall_trace_exit *rec;
struct hlist_head *head;
+ bool valid_prog_array;
int syscall_nr;
int rctx;
int size;
@@ -641,7 +644,8 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
return;
head = this_cpu_ptr(sys_data->exit_event->perf_events);
- if (hlist_empty(head))
+ valid_prog_array = bpf_prog_array_valid(sys_data->exit_event);
+ if (!valid_prog_array && hlist_empty(head))
return;
/* We can probably do that at build time */
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index e77dfa1e7db7..d86cc7d768f1 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -1119,13 +1119,12 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
{
struct trace_event_call *call = &tu->tp.call;
struct uprobe_trace_entry_head *entry;
- struct bpf_prog *prog = call->prog;
struct hlist_head *head;
void *data;
int size, esize;
int rctx;
- if (prog && !trace_call_bpf(prog, regs))
+ if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs))
return;
esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));